From dfd4e60653d7b17c283de673aaea71408d10641e Mon Sep 17 00:00:00 2001 From: Samarth Agarwal Date: Sun, 24 Nov 2024 19:47:35 -0500 Subject: [PATCH] #14690: Cleaned up gtests under `tests/tt_metal/tt_metal/` to make them more readable, organized and maintainable (#14691) --- CONTRIBUTING.md | 8 +- tests/scripts/run_cpp_unit_tests.sh | 16 +- tests/scripts/run_testpoint_perprocess.py | 2 +- tests/scripts/run_tests.sh | 2 +- tests/scripts/run_tools_tests.sh | 6 +- tests/scripts/t3000/run_t3000_unit_tests.sh | 14 +- tests/scripts/tg/run_tg_unit_tests.sh | 8 +- tests/scripts/tgg/run_tgg_unit_tests.sh | 6 +- .../tensors/test_async_tensor_apis.cpp | 10 +- tests/tt_metal/tt_metal/CMakeLists.txt | 28 +- tests/tt_metal/tt_metal/README.md | 101 ++ tests/tt_metal/tt_metal/api/CMakeLists.txt | 60 ++ .../allocator/test_free_list_allocator.cpp | 9 +- .../allocator/test_l1_banking_allocator.cpp | 6 +- .../buffer_test_utils.hpp} | 13 +- .../circular_buffer_test_utils.hpp | 0 .../test_CircularBuffer_allocation.cpp | 19 +- .../test_CircularBuffer_creation.cpp | 4 +- .../test_CircularBuffer_non_blocking.cpp | 2 +- ...ogram_with_kernel_path_env_var_fixture.hpp | 107 +++ .../core_coord}/core_coord_fixture.hpp | 5 +- .../test_CoreRangeSet_construct.cpp | 4 +- .../core_coord/test_CoreRangeSet_contains.cpp | 4 +- .../test_CoreRangeSet_intersects.cpp | 4 +- .../core_coord/test_CoreRangeSet_merge.cpp | 28 +- .../core_coord/test_CoreRange_adjacent.cpp | 4 +- .../core_coord/test_CoreRange_contains.cpp | 4 +- .../core_coord/test_CoreRange_intersects.cpp | 4 +- .../core_coord/test_CoreRange_iterator.cpp | 2 +- .../core_coord/test_CoreRange_merge.cpp | 4 +- .../tt_metal/api/test_CommandQueue.cpp | 151 +++ .../buffer => api}/test_banked.cpp | 40 +- .../common => api}/test_bit_utils.cpp | 8 +- .../dram/direct.cpp => api/test_direct.cpp} | 8 +- .../dram => api}/test_dram.cpp | 12 +- .../test_dram_to_l1_multicast.cpp | 16 +- .../test_global_semaphores.cpp | 0 .../tt_metal/api/test_kernel_creation.cpp 
| 107 +++ .../{unit_tests/basic => api}/test_noc.cpp | 56 +- .../test_runtime_args.cpp} | 23 +- .../test_semaphores.cpp} | 10 +- .../test_sharded_l1_buffer.cpp} | 8 +- .../test_simple_dram_buffer.cpp | 8 +- .../buffer => api}/test_simple_l1_buffer.cpp | 19 +- .../basic => api}/test_soc_descriptor.cpp | 9 +- .../test_tilize_untilize.cpp | 17 +- .../test_worker_config_buffer.cpp | 10 +- .../tt_metal/common/command_queue_fixture.hpp | 161 ++++ .../tt_metal/common/device_fixture.hpp | 102 ++ .../dispatch_fixture.hpp} | 43 +- .../matmul_test_utils.hpp} | 7 +- .../tt_metal/common/multi_device_fixture.hpp | 50 + .../CMakeLists.txt | 49 +- .../debug_tools_fixture.hpp} | 125 ++- .../debug_tools_test_utils.hpp} | 55 +- .../dprint/test_eth_cores.cpp | 8 +- .../dprint/test_invalid_print_core.cpp | 16 +- .../dprint/test_mute_device.cpp | 6 +- .../dprint/test_mute_print_server.cpp | 6 +- .../dprint/test_print_all_harts.cpp | 6 +- .../dprint/test_print_before_finish.cpp | 6 +- .../dprint/test_print_hanging.cpp | 6 +- .../dprint/test_print_tensix_dest.cpp | 12 +- .../dprint/test_print_tiles.cpp | 4 +- .../dprint/test_raise_wait.cpp | 6 +- .../watcher/test_assert.cpp | 27 +- .../watcher/test_link_training.cpp | 6 +- .../watcher/test_noc_sanitize.cpp | 16 +- .../watcher/test_noc_sanitize_delays.cpp | 6 +- .../watcher/test_pause.cpp | 6 +- .../watcher/test_ringbuf.cpp | 27 +- .../watcher/test_waypoint.cpp | 4 +- tests/tt_metal/tt_metal/device/CMakeLists.txt | 29 + .../tt_metal/device/galaxy_fixture.hpp | 100 ++ .../device.cpp => device/test_device.cpp} | 57 +- .../test_device_cluster_api.cpp} | 17 +- .../test_device_init_and_teardown.cpp} | 9 +- .../tt_metal/device/test_device_pool.cpp | 131 +++ .../test_galaxy_cluster_api.cpp} | 10 +- .../tt_metal/tt_metal/dispatch/CMakeLists.txt | 33 + .../dispatch/dispatch_buffer/CMakeLists.txt | 34 + ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 443 ++++++++- .../dispatch_buffer/test_sub_device.cpp | 108 +++ 
.../dispatch/dispatch_event/CMakeLists.txt | 34 + .../test_EnqueueWaitForEvent.cpp | 21 +- .../dispatch_event}/test_events.cpp | 24 +- .../dispatch/dispatch_program/CMakeLists.txt | 36 + ...ith_kernel_created_from_string_fixture.hpp | 27 + .../dispatch_program}/test_EnqueueProgram.cpp | 325 ++++++- .../dispatch_program}/test_dispatch.cpp | 16 +- ...ogram_with_kernel_created_from_string.cpp} | 33 +- .../test_dispatch_stress.cpp} | 44 +- .../dispatch_program/test_sub_device.cpp | 127 +++ .../tt_metal/dispatch/dispatch_test_utils.hpp | 93 ++ .../dispatch/dispatch_trace/CMakeLists.txt | 34 + .../dispatch_trace}/test_EnqueueTrace.cpp | 195 +++- .../dispatch_trace/test_sub_device.cpp | 269 ++++++ .../dispatch/multi_command_queue_fixture.hpp | 150 +++ .../random_program_fixture.hpp} | 171 +--- .../dispatch/sub_device_test_utils.hpp | 121 +++ tests/tt_metal/tt_metal/eth/CMakeLists.txt | 28 + .../tt_metal/tt_metal/eth/test_basic_eth.cpp | 454 +++++++++ .../test_buffer_movement_kernels.cpp} | 205 +++- .../eth/test_erisc_app_direct_send.cpp | 835 ++++++++++++++++ .../test_ring_gather_kernels.cpp} | 6 +- .../gtest_smoke/test_basic_pipeline.cpp | 1 - .../tt_metal/gtest_smoke/test_device.cpp | 1 - .../tt_metal/gtest_smoke/test_flatten.cpp | 1 - .../gtest_smoke/test_matmul_large_block.cpp | 1 - .../test_matmul_multi_core_X_dram.cpp | 1 - .../tt_metal/gtest_smoke/tests_main.cpp | 9 - .../tt_metal/integration/CMakeLists.txt | 34 + .../matmul/test_matmul_X_tile.cpp | 20 +- .../matmul/test_matmul_large_block.cpp | 13 +- .../matmul/test_matmul_multi_core_X_dram.cpp | 14 +- ...ti_core_multi_dram_in0_mcast_in1_mcast.cpp | 10 +- ...matmul_multi_core_multi_dram_inX_mcast.cpp | 12 +- .../matmul/test_matmul_single_core.cpp | 14 +- .../test_autonomous_relay_streams.cpp | 26 +- .../test_basic_pipeline.cpp} | 13 +- .../compute => integration}/test_flatten.cpp | 163 +++- .../test_sfpu_compute.cpp} | 11 +- tests/tt_metal/tt_metal/llk/CMakeLists.txt | 36 + .../compute => llk}/test_broadcast.cpp | 
6 +- .../test_copy_block_matmul_partials.cpp | 6 +- .../compute => llk}/test_cumsum.cpp | 4 +- .../test_dropout_sfpu_compute.cpp | 2 +- .../compute => llk}/test_golden_impls.cpp | 0 .../compute => llk}/test_golden_impls.hpp | 0 .../compute => llk}/test_reconfig.cpp | 4 +- .../compute => llk}/test_reduce.cpp | 18 +- .../compute => llk}/test_sfpu_compute.cpp | 12 +- .../test_single_core_binary_compute.cpp | 30 +- .../test_single_core_matmul_compute.cpp | 8 +- .../compute => llk}/test_transpose.cpp | 4 +- .../compute => llk}/test_untilize_tilize.cpp | 18 +- .../test_dram_read_remote_cb.cpp | 2 +- .../test_remote_cb_sync_matmul.cpp | 2 +- .../1_compute_mm/test_compute_mm.cpp | 4 +- tests/tt_metal/tt_metal/stl/CMakeLists.txt | 26 + .../tt_stl => stl}/test_any_range.cpp | 0 .../slotmap.cpp => stl/test_slotmap.cpp} | 0 .../test_enqueue_program.cpp | 0 .../tt_metal/test_kernel_path_env_var.cpp | 134 --- .../misc/sub_device}/incrementer.cpp | 0 .../sub_device}/persistent_remote_waiter.cpp | 0 .../misc/sub_device}/persistent_waiter.cpp | 0 .../misc/sub_device}/syncer.cpp | 0 .../tt_metal/unit_tests/CMakeLists.txt | 97 -- tests/tt_metal/tt_metal/unit_tests/README.md | 44 - .../unit_tests/buffer/test_buffer_utils.hpp | 15 - .../unit_tests/common/basic_fixture.hpp | 33 - .../unit_tests/common/device_fixture.hpp | 186 ---- .../unit_tests/common/n300_device_fixture.hpp | 50 - .../unit_tests/ethernet/basic_eth_kernels.cpp | 895 ------------------ .../ethernet/erisc_app_direct_send.cpp | 278 ------ .../fast_dispatch_kernels/test_write_host.cpp | 260 ----- .../tt_metal/unit_tests/tests_main.cpp | 5 - .../basic/test_kernel_creation.cpp | 79 -- .../common/dprint_fixture.hpp | 95 -- .../unit_tests_fast_dispatch/CMakeLists.txt | 40 - .../unit_tests_fast_dispatch/README.md | 44 - .../command_queue_test_utils.hpp | 32 - .../command_queue/test_CommandQueue.cpp | 73 -- .../command_queue/test_HostAsyncCQ.cpp | 348 ------- .../multichip/test_device_pool.cpp | 136 --- 
.../multichip/test_eth_EnqueueProgram.cpp | 735 -------------- .../test_eth_ring_gather_EnqueueProgram.cpp | 495 ---------- .../sub_device/test_sub_device.cpp | 585 ------------ .../unit_tests_fast_dispatch/tests_main.cpp | 5 - .../CMakeLists.txt | 26 - .../command_queue/test_EnqueueProgram.cpp | 273 ------ .../command_queue/test_EnqueueTrace.cpp | 241 ----- ...queueWriteBuffer_and_EnqueueReadBuffer.cpp | 355 ------- .../common/command_queue_fixture.hpp | 120 --- .../common/command_queue_test_utils.hpp | 38 - .../tests_main.cpp | 5 - .../unit_tests_frequent/CMakeLists.txt | 26 - .../unit_tests_frequent/tests_main.cpp | 5 - tests/ttnn/unit_tests/gtests/test_add.cpp | 2 +- .../ttnn/unit_tests/gtests/test_graph_add.cpp | 2 +- 181 files changed, 5416 insertions(+), 6832 deletions(-) create mode 100644 tests/tt_metal/tt_metal/README.md create mode 100644 tests/tt_metal/tt_metal/api/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests => api}/allocator/test_free_list_allocator.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests => api}/allocator/test_l1_banking_allocator.cpp (92%) rename tests/tt_metal/tt_metal/{unit_tests/buffer/test_buffer_utils.cpp => api/buffer_test_utils.hpp} (76%) rename tests/tt_metal/tt_metal/{unit_tests => api}/circular_buffer/circular_buffer_test_utils.hpp (100%) rename tests/tt_metal/tt_metal/{unit_tests => api}/circular_buffer/test_CircularBuffer_allocation.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests => api}/circular_buffer/test_CircularBuffer_creation.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests => api}/circular_buffer/test_CircularBuffer_non_blocking.cpp (98%) create mode 100644 tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests/common => api/core_coord}/core_coord_fixture.hpp (89%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRangeSet_construct.cpp (86%) rename tests/tt_metal/tt_metal/{unit_tests => 
api}/core_coord/test_CoreRangeSet_contains.cpp (96%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRangeSet_intersects.cpp (96%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRangeSet_merge.cpp (73%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRange_adjacent.cpp (91%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRange_contains.cpp (94%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRange_intersects.cpp (90%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRange_iterator.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests => api}/core_coord/test_CoreRange_merge.cpp (92%) create mode 100644 tests/tt_metal/tt_metal/api/test_CommandQueue.cpp rename tests/tt_metal/tt_metal/{unit_tests/buffer => api}/test_banked.cpp (93%) rename tests/tt_metal/tt_metal/{unit_tests_common/common => api}/test_bit_utils.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests/dram/direct.cpp => api/test_direct.cpp} (98%) rename tests/tt_metal/tt_metal/{unit_tests_common/dram => api}/test_dram.cpp (93%) rename tests/tt_metal/tt_metal/{unit_tests_common/dram => api}/test_dram_to_l1_multicast.cpp (94%) rename tests/tt_metal/tt_metal/{unit_tests/global_semaphore => api}/test_global_semaphores.cpp (100%) create mode 100644 tests/tt_metal/tt_metal/api/test_kernel_creation.cpp rename tests/tt_metal/tt_metal/{unit_tests/basic => api}/test_noc.cpp (78%) rename tests/tt_metal/tt_metal/{unit_tests/basic/runtime_args.cpp => api/test_runtime_args.cpp} (97%) rename tests/tt_metal/tt_metal/{unit_tests/basic/initialize_semaphores.cpp => api/test_semaphores.cpp} (96%) rename tests/tt_metal/tt_metal/{unit_tests/buffer/test_sharded_l1.cpp => api/test_sharded_l1_buffer.cpp} (97%) rename tests/tt_metal/tt_metal/{unit_tests/buffer => api}/test_simple_dram_buffer.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests/buffer => api}/test_simple_l1_buffer.cpp (95%) rename 
tests/tt_metal/tt_metal/{unit_tests/basic => api}/test_soc_descriptor.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests/host_apis => api}/test_tilize_untilize.cpp (84%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/command_queue => api}/test_worker_config_buffer.cpp (87%) create mode 100644 tests/tt_metal/tt_metal/common/command_queue_fixture.hpp create mode 100644 tests/tt_metal/tt_metal/common/device_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests_common/common/common_fixture.hpp => common/dispatch_fixture.hpp} (88%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute/matmul/matmul_utils.hpp => common/matmul_test_utils.hpp} (97%) create mode 100644 tests/tt_metal/tt_metal/common/multi_device_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/CMakeLists.txt (50%) rename tests/tt_metal/tt_metal/{unit_tests_common/common/watcher_fixture.hpp => debug_tools/debug_tools_fixture.hpp} (53%) rename tests/tt_metal/tt_metal/{unit_tests_common/common/test_utils.hpp => debug_tools/debug_tools_test_utils.hpp} (75%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_eth_cores.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_invalid_print_core.cpp (65%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_mute_device.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_mute_print_server.cpp (94%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_print_all_harts.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_print_before_finish.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_print_hanging.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_print_tensix_dest.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_print_tiles.cpp 
(99%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/dprint/test_raise_wait.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_assert.cpp (91%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_link_training.cpp (93%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_noc_sanitize.cpp (96%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_noc_sanitize_delays.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_pause.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_ringbuf.cpp (88%) rename tests/tt_metal/tt_metal/{unit_tests_common => debug_tools}/watcher/test_waypoint.cpp (99%) create mode 100644 tests/tt_metal/tt_metal/device/CMakeLists.txt create mode 100644 tests/tt_metal/tt_metal/device/galaxy_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests/basic/device.cpp => device/test_device.cpp} (85%) rename tests/tt_metal/tt_metal/{unit_tests/ethernet/device_cluster_api.cpp => device/test_device_cluster_api.cpp} (93%) rename tests/tt_metal/tt_metal/{unit_tests_common/basic/test_device_init.cpp => device/test_device_init_and_teardown.cpp} (93%) create mode 100644 tests/tt_metal/tt_metal/device/test_device_pool.cpp rename tests/tt_metal/tt_metal/{unit_tests/ethernet/galaxy_cluster_api.cpp => device/test_galaxy_cluster_api.cpp} (97%) create mode 100644 tests/tt_metal/tt_metal/dispatch/CMakeLists.txt create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/command_queue => dispatch/dispatch_buffer}/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp (63%) create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt rename 
tests/tt_metal/tt_metal/{unit_tests_fast_dispatch_single_chip_multi_queue/command_queue => dispatch/dispatch_event}/test_EnqueueWaitForEvent.cpp (96%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/command_queue => dispatch/dispatch_event}/test_events.cpp (95%) create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/command_queue => dispatch/dispatch_program}/test_EnqueueProgram.cpp (80%) rename tests/tt_metal/tt_metal/{unit_tests_common/common => dispatch/dispatch_program}/test_dispatch.cpp (95%) rename tests/tt_metal/tt_metal/{test_create_kernel_from_string.cpp => dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp} (76%) rename tests/tt_metal/tt_metal/{unit_tests_frequent/tests/run_many_times.cpp => dispatch/dispatch_program/test_dispatch_stress.cpp} (80%) create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/command_queue => dispatch/dispatch_trace}/test_EnqueueTrace.cpp (80%) create mode 100644 tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp create mode 100644 tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/common/command_queue_fixture.hpp => dispatch/random_program_fixture.hpp} (72%) create mode 100644 tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp create mode 100644 tests/tt_metal/tt_metal/eth/CMakeLists.txt create mode 100644 tests/tt_metal/tt_metal/eth/test_basic_eth.cpp rename tests/tt_metal/tt_metal/{unit_tests/ethernet/buffer_movement_kernels.cpp => 
eth/test_buffer_movement_kernels.cpp} (62%) create mode 100644 tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp rename tests/tt_metal/tt_metal/{unit_tests/ethernet/ring_gather_kernels.cpp => eth/test_ring_gather_kernels.cpp} (99%) delete mode 120000 tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp delete mode 120000 tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp delete mode 120000 tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp delete mode 120000 tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp delete mode 120000 tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp delete mode 100644 tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp create mode 100644 tests/tt_metal/tt_metal/integration/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_X_tile.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_large_block.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_multi_core_X_dram.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/matmul/test_matmul_single_core.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/streams => integration}/test_autonomous_relay_streams.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp => integration/test_basic_pipeline.cpp} (96%) rename tests/tt_metal/tt_metal/{unit_tests_common/compute => integration}/test_flatten.cpp (54%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp => integration/test_sfpu_compute.cpp} (97%) 
create mode 100644 tests/tt_metal/tt_metal/llk/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_broadcast.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_copy_block_matmul_partials.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_cumsum.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_dropout_sfpu_compute.cpp (99%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_golden_impls.cpp (100%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_golden_impls.hpp (100%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_reconfig.cpp (99%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_reduce.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_sfpu_compute.cpp (97%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_single_core_binary_compute.cpp (95%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_single_core_matmul_compute.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_transpose.cpp (98%) rename tests/tt_metal/tt_metal/{unit_tests/compute => llk}/test_untilize_tilize.cpp (97%) create mode 100644 tests/tt_metal/tt_metal/stl/CMakeLists.txt rename tests/tt_metal/tt_metal/{unit_tests/tt_stl => stl}/test_any_range.cpp (100%) rename tests/tt_metal/tt_metal/{unit_tests/tt_stl/slotmap.cpp => stl/test_slotmap.cpp} (100%) rename tests/tt_metal/tt_metal/{tt_dispatch => }/test_enqueue_program.cpp (100%) delete mode 100644 tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/sub_device/kernels => test_kernels/misc/sub_device}/incrementer.cpp (100%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/sub_device/kernels => test_kernels/misc/sub_device}/persistent_remote_waiter.cpp (100%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/sub_device/kernels => 
test_kernels/misc/sub_device}/persistent_waiter.cpp (100%) rename tests/tt_metal/tt_metal/{unit_tests_fast_dispatch/sub_device/kernels => test_kernels/misc/sub_device}/syncer.cpp (100%) delete mode 100644 tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/unit_tests/README.md delete mode 100644 tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests/tests_main.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp delete mode 100644 
tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt delete mode 100644 tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 921f8f8d16b..7b9fc84dca8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -251,8 +251,8 @@ The new fangled way we run our tests is with Googletest. The way we generally structure our tests with this framework is to bundle it into a single executable. -You can use `--gtest_filter_test` to filter out the specific test you'd like. -For example, to build and run the `CommonFixture.DRAMLoopbackSingleCore` on +You can use `--gtest_filter` to filter out the specific test you'd like. +For example, to build and run the `DispatchFixture.TensixDRAMLoopbackSingleCore` on fast dispatch, you can 1. Build the unit tests: @@ -261,7 +261,7 @@ fast dispatch, you can ``` 2. 
Run the test: ``` - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommonFixture.DRAMLoopbackSingleCore" + ./build/test/tt_metal/unit_tests_api --gtest_filter="DispatchFixture.TensixDRAMLoopbackSingleCore" ``` On slow dispatch, to run another specific test, the equivalent would be: @@ -270,7 +270,7 @@ On slow dispatch, to run another specific test, the equivalent would be: 2. Run with the slow dispatch mode: ``` export TT_METAL_SLOW_DISPATCH_MODE=1 - ./build/test/tt_metal/unit_tests/fast_dispatch --gtest_filter_test="BasicFixture.TestL1BuffersAllocatedTopDown" + ./build/test/tt_metal/unit_tests_api --gtest_filter="DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown" ``` We have split our tests into the two dispatch modes for less pollution of state diff --git a/tests/scripts/run_cpp_unit_tests.sh b/tests/scripts/run_cpp_unit_tests.sh index 7da1c173021..ff24af920f8 100755 --- a/tests/scripts/run_cpp_unit_tests.sh +++ b/tests/scripts/run_cpp_unit_tests.sh @@ -9,19 +9,25 @@ fi kernel_path="/tmp/kernels" mkdir -p $kernel_path -TT_METAL_KERNEL_PATH=$kernel_path ./build/test/tt_metal/test_kernel_path_env_var +TT_METAL_KERNEL_PATH=$kernel_path ./build/test/tt_metal/unit_tests_api --gtest_filter=CompileProgramWithKernelPathEnvVarFixture.* rm -rf $kernel_path +./build/test/tt_metal/unit_tests_api +./build/test/tt_metal/unit_tests_debug_tools +./build/test/tt_metal/unit_tests_device +./build/test/tt_metal/unit_tests_dispatch +./build/test/tt_metal/unit_tests_eth +./build/test/tt_metal/unit_tests_llk +./build/test/tt_metal/unit_tests_stl + if [[ !
-z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then - ./build/test/tt_metal/unit_tests env python tests/scripts/run_tt_metal.py --dispatch-mode slow env python tests/scripts/run_tt_eager.py --dispatch-mode slow else - ./build/test/tt_metal/unit_tests_fast_dispatch - TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter=MultiCommandQueueSingleDeviceFixture.* + TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.* # Enable this on BH after #14613 if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then - TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_fast_dispatch + TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_dispatch fi env python tests/scripts/run_tt_eager.py --dispatch-mode fast env python tests/scripts/run_tt_metal.py --dispatch-mode fast diff --git a/tests/scripts/run_testpoint_perprocess.py b/tests/scripts/run_testpoint_perprocess.py index 8a2feb156bb..3d2c4a88e9d 100755 --- a/tests/scripts/run_testpoint_perprocess.py +++ b/tests/scripts/run_testpoint_perprocess.py @@ -13,7 +13,7 @@ DEBUG = False TT_METAL_HOME = os.environ["TT_METAL_HOME"] -DEFAULT_GTEST = f"{TT_METAL_HOME}/build/test/tt_metal/unit_tests" +DEFAULT_GTEST = f"{TT_METAL_HOME}/build/test/tt_metal/unit_tests_api" def extract_list_of_test_points(args: argparse.Namespace): diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 517503b2646..6662f2f7b2c 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -76,7 +76,7 @@ run_frequent_api_pipeline_tests() { local dispatch_mode=$3 if [[ $dispatch_mode == "slow" ]]; then - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_frequent + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=DispatchStress.TensixRunManyTimes echo "Running Python API unit tests in SD for frequent..." 
./tests/scripts/run_python_api_unit_tests.sh fi diff --git a/tests/scripts/run_tools_tests.sh b/tests/scripts/run_tools_tests.sh index d86be0f8c0e..7283788336c 100755 --- a/tests/scripts/run_tools_tests.sh +++ b/tests/scripts/run_tools_tests.sh @@ -12,7 +12,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Running watcher dump tool tests..." # Run a test that populates basic fields but not watcher fields - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*PrintHanging + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*PrintHanging # Run dump tool w/ minimum data - no error expected. ./build/tools/watcher_dump -d=0 -w -c @@ -22,7 +22,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Watcher dump minimal test - Pass" # Now run with all watcher features, expect it to throw. - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherAssertBrisc + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherAssertBrisc ./build/tools/watcher_dump -d=0 -w &> tmp.log || { echo "Above failure is expected."; } # Verify the error we expect showed up in the program output. @@ -30,7 +30,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Watcher dump all data test - Pass" # Check that stack dumping is working - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*TestWatcherRingBufferBrisc + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*TestWatcherRingBufferBrisc ./build/tools/watcher_dump -d=0 -w grep "brisc highest stack usage:" generated/watcher/watcher.log > /dev/null || { echo "Error: couldn't find stack usage in watcher log after dump." 
; exit 1; } echo "Watcher stack usage test - Pass" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 6e89ceff603..6b33b853a07 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -8,13 +8,13 @@ run_t3000_ttmetal_tests() { echo "LOG_METAL: Running run_t3000_ttmetal_tests" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$? - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$? - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$? - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsDirectSendAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsDirectRingGatherAllChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsInterleavedRingGatherAllChips" ; fail+=$? 
+ TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueMultiDevice*Fixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index 52ad5748558..669f6383b3a 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -5,11 +5,11 @@ run_tg_tests() { echo "LOG_METAL: running run_tg_unit_tests" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGFixture.*" - ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGFixture.*" - TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter="MultiCommandQueueMultiDeviceFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" + ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" + TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="MultiCommandQueueMultiDevice*Fixture.*" } diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh index 08f8f08c421..0eb73d5e823 100755 --- a/tests/scripts/tgg/run_tgg_unit_tests.sh +++ b/tests/scripts/tgg/run_tgg_unit_tests.sh @@ -5,10 +5,10 @@ run_tgg_tests() { echo "LOG_METAL: running run_tgg_unit_tests" - 
TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGGFixture.*" - ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGGFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" + ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" pytest -s tests/ttnn/distributed/test_mesh_device_TGG.py } diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index 0418df6b535..95a47a7f382 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp @@ -14,7 +14,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_impl.hpp" #include "ttnn/tensor/types.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "ttnn/operations/numpy/functions.hpp" @@ -37,7 +37,7 @@ uint32_t get_device_buffer_address(const Tensor& tensor) { } } -TEST_F(CommonFixture, TestTensorOwnershipSanity) { +TEST_F(DispatchFixture, TestTensorOwnershipSanity) { // Sanity test tensor read, write and update paths with synchronous // Ensure that tensor data is copied and owned as expected Device* device = this->devices_[0]; @@ -112,7 +112,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) { EXPECT_EQ(readback_tensor.get_shape(), ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 32, 128}))); } -TEST_F(CommonFixture, TestAsyncEltwiseBinary) { +TEST_F(DispatchFixture, TestAsyncEltwiseBinary) { 
Device* device = this->devices_[0]; device->enable_async(true); // Populate these in first loop and verify that deallocation worked - addresses should be identical across loops @@ -169,7 +169,7 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) { Tensor tensor_identity_copy_function(const Tensor& tensor) { return tensor; } -TEST_F(CommonFixture, TestAsyncRefCountManager) { +TEST_F(DispatchFixture, TestAsyncRefCountManager) { Device* device = this->devices_[0]; device->enable_async(true); @@ -226,7 +226,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { device->enable_async(false); } -TEST_F(CommonFixture, TestTensorAsyncDataMovement) { +TEST_F(DispatchFixture, TestTensorAsyncDataMovement) { // Test 2 data paths here (resembles async mode): // 1. Main -> Worker: Create a tensor in the main thread. Ensure that it is accessible in the worker thread even // after its destroyed diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index 936c98f4f2d..54ede4a02c5 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -35,9 +35,7 @@ set(TT_METAL_TESTS_SRCS test_core_range_set.cpp test_compile_sets_kernel_binaries.cpp test_compile_program.cpp - test_kernel_path_env_var.cpp test_clean_init.cpp - test_create_kernel_from_string.cpp ) foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) @@ -62,21 +60,27 @@ foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) list(APPEND METAL_TEST_TARGETS ${TEST}) endforeach() -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_common) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_fast_dispatch) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_fast_dispatch_single_chip_multi_queue) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_frequent) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/api) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/debug_tools) 
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/device) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/eth) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/integration) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/llk) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/perf_microbenchmark) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/stl) add_custom_target( metal_tests DEPENDS ${METAL_TEST_TARGETS} - unit_tests - unit_tests_fast_dispatch - unit_tests_fast_dispatch_single_chip_multi_queue - unit_tests_frequent metal_perf_microbenchmark_tests - unit_tests_galaxy + unit_tests_api + unit_tests_debug_tools + unit_tests_device + unit_tests_dispatch + unit_tests_eth + unit_tests_integration + unit_tests_llk + unit_tests_stl ) diff --git a/tests/tt_metal/tt_metal/README.md b/tests/tt_metal/tt_metal/README.md new file mode 100644 index 00000000000..00336f8653b --- /dev/null +++ b/tests/tt_metal/tt_metal/README.md @@ -0,0 +1,101 @@ +In order to keep our test suite clean, organized and searchable, please follow the guidelines provided below when adding new tests, modifying existing tests or deleting outdated tests. 
+ + + +Table of Contents +================= + +- [Table of Contents](#table-of-contents) + - [Test Naming](#test-naming) + - [Test Organization](#test-organization) + - [Fixture Naming](#fixture-naming) + - [Fixture Organization](#fixture-organization) + - [File Naming](#file-naming) + - [File Organization](#file-organization) + - [api/](#api) + - [debug_tools/](#debug_tools) + - [device/](#device) + - [dispatch/](#dispatch) + - [eth/](#eth) + - [integration/](#integration) + - [llk/](#llk) + - [stl/](#stl) + - [test_kernels/](#test_kernels) + - [common/](#common) + + + + + +## Test Naming +Prefix test names with the core type(s) that the test is using: + - If it's using Tensix cores, prefix it with `Tensix` + - If it's using active ethernet cores, prefix it with `ActiveEth` + - If it's using idle ethernet cores, prefix it with `IdleEth` + - If it's using both active and idle ethernet cores, prefix it with `Eth` + - If it's using multiple core types, prefix it with each core type, e.g. `TensixActiveEth`, `TensixIdleEth`, `TensixEth`, etc. + - If it isn't using any core type, don't prefix it with anything + +## Test Organization +Every test should belong to either a test suite or a test fixture. Use the TEST macro for tests in test suites and the TEST_F or TEST_P macros for tests in test fixtures. + +Test suites are ideal for grouping related tests that don't require shared code. Test fixtures are better suited for related tests that need shared code, which can be defined in the fixture. + +Keep related tests grouped together to make it easier to understand the overall test coverage. + +## Fixture Naming +All fixture names should end in `Fixture`. + +## Fixture Organization +Before creating a new fixture, check if an existing fixture meets your needs. If you need to create a new fixture, consider subclassing an existing fixture to avoid duplicating functionality already provided by another fixture. 
+ +## File Naming +File names should include specific prefixes or suffixes based on their content: + - Files that contain fixtures should have their names end with `_fixture` + - Files that contain helper functions and/or test utilities should have their names end with `_test_utils` + - Files that contain tests should have their names start with `test_` + +## File Organization +Place test utility files and fixture files as close as possible to the files that rely on them. For example, if you have a test file `test_A.cpp` in `tests/tt_metal/tt_metal/dispatch/dispatch_buffer/` and another test file `test_B.cpp` in `tests/tt_metal/tt_metal/dispatch/dispatch_program/`, and both need to use a fixture file `C_fixture.hpp`, it is logical to place `C_fixture.hpp` in `tests/tt_metal/tt_metal/dispatch/`. This ensures the fixture is easily accessible to the relevant test files while avoiding unnecessary clutter in a more generic directory like `tests/tt_metal/tt_metal/common/`. + +Tests using Google Test should be placed in one of the directories listed below that best aligns with their purpose. If multiple directories seem suitable, use your best judgment to select the most appropriate one. 
+ +__Important note: only tests that use Google Test should be placed in the following directories.__ + +### `api/` + - Contains tests that explicitly test `tt-metal`'s API + - Contains tests that read from and/or write to the device + +### `debug_tools/` + - Contains tests for DPrint and Watcher + +### `device/` + - Contains tests for device initialization and teardown + - Contains tests that check device-specific properties + +### `dispatch/` + - Contains tests that explicitly test for properties relating to dispatch + - Contains both slow dispatch and fast dispatch tests + +### `eth/` + - Contains tests that check ethernet communication between multiple devices + - Contains tests that explicitly test ethernet properties on a single device + +### `integration/` + - Contains tests for real-world use cases, eg. matmul, etc + +### `llk/` + - Contains tests for compute Low-Level Kernel (LLK) API + - Tests don't cover individual compute LLK calls, but cover testing LLK API calls as these are used in compute kernels + +### `stl/` + - Contains tests which test custom data structures and algorithms used in `tt-metal` + - None of the tests in this directory should run on the device + +The following directories should be reserved for files that support testing but should not contain actual tests themselves. 
+ +### `test_kernels/` + - Contains kernels that are used in tests + +### `common/` + - Contains test fixtures and utilities shared across multiple directories listed above diff --git a/tests/tt_metal/tt_metal/api/CMakeLists.txt b/tests/tt_metal/tt_metal/api/CMakeLists.txt new file mode 100644 index 00000000000..fc95afeb92e --- /dev/null +++ b/tests/tt_metal/tt_metal/api/CMakeLists.txt @@ -0,0 +1,60 @@ +set(UNIT_TESTS_API_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_l1_banking_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_allocation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_creation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_non_blocking.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_adjacent.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_contains.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_intersects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_iterator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_construct.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_contains.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_intersects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_banked.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_bit_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_CommandQueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_direct.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dram_to_l1_multicast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_global_semaphores.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_creation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_noc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_runtime_args.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_semaphores.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/test_sharded_l1_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_simple_dram_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_simple_l1_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_soc_descriptor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_tilize_untilize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_worker_config_buffer.cpp +) + +add_executable(unit_tests_api ${UNIT_TESTS_API_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_api) + +target_link_libraries( + unit_tests_api + PRIVATE + test_metal_common_libs + Boost::smart_ptr +) +target_include_directories( + unit_tests_api + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_api + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp rename to tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp index d7b5ffdf52f..e16965f0d31 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp @@ -4,13 +4,12 @@ #include -#include "basic_fixture.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/allocator/algorithms/free_list.hpp" // TODO: Add a variant with randomized allocations and deallocations -TEST_F(BasicFixture, TestDirectedSeriesOfAllocDealloc) { +TEST(FreeListAllocator, TestDirectedSeriesOfAllocDealloc) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; 
constexpr uint32_t alignment = 32; @@ -132,7 +131,7 @@ TEST_F(BasicFixture, TestDirectedSeriesOfAllocDealloc) { EXPECT_EQ(addr_20.value(), 64); } -TEST_F(BasicFixture, TestResizeAllocator) { +TEST(FreeListAllocator, TestResizeAllocator) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; constexpr uint32_t alignment = 32; @@ -184,7 +183,7 @@ TEST_F(BasicFixture, TestResizeAllocator) { EXPECT_EQ(addr_6.value(), 32); } -TEST_F(BasicFixture, TestDirectedResizeAllocator) { +TEST(FreeListAllocator, TestDirectedResizeAllocator) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; constexpr uint32_t alignment = 32; diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp similarity index 92% rename from tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp rename to tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp index 27134acd303..738e79c3fd4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp @@ -4,9 +4,7 @@ #include -#include "basic_fixture.hpp" #include "device_fixture.hpp" -#include "tt_metal/common/core_descriptor.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -26,7 +24,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::Device *device) { } // namespace unit_tests::test_l1_banking_allocator -TEST_F(DeviceSingleCardFixture, TestL1BuffersAllocatedTopDown) { +TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersAllocatedTopDown) { std::vector alloc_sizes = {32 * 1024, 64 * 1024, 128 * 1024}; size_t total_size_bytes = 0; @@ -50,7 +48,7 @@ TEST_F(DeviceSingleCardFixture, TestL1BuffersAllocatedTopDown) { buffers.clear(); } -TEST_F(DeviceSingleCardFixture, TestL1BuffersDoNotGrowBeyondBankSize) { 
+TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersDoNotGrowBeyondBankSize) { uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(this->device_); tt::tt_metal::InterleavedBufferConfig l1_config{ diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp b/tests/tt_metal/tt_metal/api/buffer_test_utils.hpp similarity index 76% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp rename to tests/tt_metal/tt_metal/api/buffer_test_utils.hpp index 0ffef3f73dc..a883090e09d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp +++ b/tests/tt_metal/tt_metal/api/buffer_test_utils.hpp @@ -2,24 +2,25 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "test_buffer_utils.hpp" -#include "tt_metal/detail/tt_metal.hpp" +#pragma once + +#include "host_api.hpp" namespace tt::test::buffer::detail { -void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data) { +inline void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data) { tt::log_info("{} -- coord={} address={}", __FUNCTION__, coord.str(), address); tt_metal::detail::WriteToDeviceL1(device, coord, address, data); } -void readL1Backdoor( +inline void readL1Backdoor( tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, uint32_t byte_size, std::vector& data) { tt::log_info("{} -- coord={} address={} byte_size={}", __FUNCTION__, coord.str(), address, byte_size); tt_metal::detail::ReadFromDeviceL1(device, coord, address, byte_size, data); } -void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data) { +inline void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data) { tt::log_info("{} -- channel={} address={}", __FUNCTION__, channel, address); tt_metal::detail::WriteToDeviceDRAMChannel(device, channel, address, data); } -void 
readDramBackdoor( +inline void readDramBackdoor( tt::tt_metal::Device* device, uint32_t channel, uint32_t address, uint32_t byte_size, std::vector& data) { tt::log_info("{} -- channel={} address={} byte_size={}", __FUNCTION__, channel, address, byte_size); tt_metal::detail::ReadFromDeviceDRAMChannel(device, channel, address, byte_size, data); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp b/tests/tt_metal/tt_metal/api/circular_buffer/circular_buffer_test_utils.hpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp rename to tests/tt_metal/tt_metal/api/circular_buffer/circular_buffer_test_utils.hpp diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 1df5ec9cdfd..04182323c6d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -6,7 +6,6 @@ #include "gtest/gtest.h" #include "circular_buffer_test_utils.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/impl/buffers/circular_buffer.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -39,7 +38,7 @@ void validate_cb_address(Program &program, Device *device, const CoreRangeSet &c } } -TEST_F(DeviceFixture, TestCircularBuffersSequentiallyPlaced) { +TEST_F(DeviceFixture, TensixTestCircularBuffersSequentiallyPlaced) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -66,7 +65,7 @@ TEST_F(DeviceFixture, TestCircularBuffersSequentiallyPlaced) { } } -TEST_F(DeviceFixture, 
TestCircularBufferSequentialAcrossAllCores) { +TEST_F(DeviceFixture, TensixTestCircularBufferSequentialAcrossAllCores) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -108,7 +107,7 @@ TEST_F(DeviceFixture, TestCircularBufferSequentialAcrossAllCores) { } } -TEST_F(DeviceFixture, TestValidCircularBufferAddress) { +TEST_F(DeviceFixture, TensixTestValidCircularBufferAddress) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -149,7 +148,7 @@ TEST_F(DeviceFixture, TestValidCircularBufferAddress) { } } -TEST_F(DeviceFixture, TestCircularBuffersAndL1BuffersCollision) { +TEST_F(DeviceFixture, TensixTestCircularBuffersAndL1BuffersCollision) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; uint32_t page_size = TileSize(tt::DataFormat::Float16_b); @@ -181,7 +180,7 @@ TEST_F(DeviceFixture, TestCircularBuffersAndL1BuffersCollision) { } } -TEST_F(DeviceFixture, TestValidUpdateCircularBufferSize) { +TEST_F(DeviceFixture, TensixTestValidUpdateCircularBufferSize) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -215,7 +214,7 @@ TEST_F(DeviceFixture, TestValidUpdateCircularBufferSize) { } } -TEST_F(DeviceFixture, TestInvalidUpdateCircularBufferSize) { +TEST_F(DeviceFixture, TensixTestInvalidUpdateCircularBufferSize) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -245,7 +244,7 @@ TEST_F(DeviceFixture, TestInvalidUpdateCircularBufferSize) { } } -TEST_F(DeviceFixture, TestUpdateCircularBufferAddress) { +TEST_F(DeviceFixture, TensixTestUpdateCircularBufferAddress) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -284,7 +283,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferAddress) { } } -TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { +TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (unsigned int id = 0; 
id < num_devices_; id++) { Device *device = this->devices_.at(id); Program program; @@ -360,7 +359,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { } } -TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) { +TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CoreCoord core(0, 0); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp index 984dba24740..5ee907c065d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp @@ -46,7 +46,7 @@ bool test_cb_config_written_to_core(Program &program, Device *device, const Core return pass; } -TEST_F(DeviceFixture, TestCreateCircularBufferAtValidIndices) { +TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) { CBConfig cb_config; CoreRange cr({0, 0}, {0, 1}); @@ -95,7 +95,7 @@ TEST_F(DeviceFixture, TestCreateCircularBufferWithMismatchingConfig) { EXPECT_ANY_THROW(CircularBufferConfig(cb_config.page_size, {{0, cb_config.data_format}}).set_page_size(1, cb_config.page_size)); } -TEST_F(DeviceFixture, TestCreateCircularBufferAtOverlappingIndex) { +TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtOverlappingIndex) { Program program; CBConfig cb_config; diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp similarity index 98% rename from 
tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp index c10b6c6e0db..0a4b0849ca4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp @@ -61,7 +61,7 @@ std::vector generate_rt_args(uint32_t master_semaphore, uint32_t slave return rt_args; } -TEST_F(DeviceFixture, TestCircularBufferNonBlockingAPIs) { +TEST_F(DeviceFixture, TensixTestCircularBufferNonBlockingAPIs) { Program program; Device *device = devices_.at(0); diff --git a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp new file mode 100644 index 00000000000..bf1f6403018 --- /dev/null +++ b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "host_api.hpp" +#include "logger.hpp" + +using namespace tt; + +class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { + protected: + void SetUp() override { + if (!this->are_preconditions_satisfied()) { + GTEST_SKIP(); + } + + const chip_id_t device_id = 0; + this->device_ = CreateDevice(device_id); + this->program_ = CreateProgram(); + } + + void TearDown() override { + if (!IsSkipped()) { + CloseDevice(this->device_); + } + } + + void create_kernel(const string &kernel_file) { + CoreCoord core(0, 0); + tt_metal::CreateKernel( + this->program_, + kernel_file, + core, + tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + } + + void setup_kernel_dir(const string &orig_kernel_file, const string &new_kernel_file) { + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + const std::filesystem::path &kernel_file_path_under_kernel_dir(kernel_dir + new_kernel_file); + const std::filesystem::path &dirs_under_kernel_dir = kernel_file_path_under_kernel_dir.parent_path(); + std::filesystem::create_directories(dirs_under_kernel_dir); + + const string &metal_root = llrt::OptionsG.get_root_dir(); + const std::filesystem::path &kernel_file_path_under_metal_root(metal_root + orig_kernel_file); + std::filesystem::copy(kernel_file_path_under_metal_root, kernel_file_path_under_kernel_dir); + } + + void cleanup_kernel_dir() { + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + for (const std::filesystem::directory_entry &entry : std::filesystem::directory_iterator(kernel_dir)) { + std::filesystem::remove_all(entry); + } + } + + Device *device_; + Program program_; + + private: + bool are_preconditions_satisfied() { + return this->are_env_vars_set() && this->is_kernel_dir_valid(); + } + + bool are_env_vars_set() { + bool are_set = true; + if (!llrt::OptionsG.is_root_dir_specified()) { + log_info(LogTest, 
"Skipping test: TT_METAL_HOME must be set"); + are_set = false; + } + if (!llrt::OptionsG.is_kernel_dir_specified()) { + log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be set"); + are_set = false; + } + return are_set; + } + + bool is_kernel_dir_valid() { + bool is_valid = true; + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + if (!this->does_path_exist(kernel_dir) || !this->is_path_a_directory(kernel_dir) || + !this->is_dir_empty(kernel_dir)) { + log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be an existing, empty directory"); + is_valid = false; + } + return is_valid; + } + + bool does_path_exist(const string &path) { + const std::filesystem::path &file_path(path); + return std::filesystem::exists(file_path); + } + + bool is_path_a_directory(const string &path) { + TT_FATAL(this->does_path_exist(path), "{} does not exist", path); + const std::filesystem::path &file_path(path); + return std::filesystem::is_directory(file_path); + } + + bool is_dir_empty(const string &path) { + TT_FATAL(this->does_path_exist(path), "{} does not exist", path); + TT_FATAL(this->is_path_a_directory(path), "{} is not a directory", path); + const std::filesystem::path &file_path(path); + return std::filesystem::is_empty(file_path); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp b/tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp similarity index 89% rename from tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp rename to tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp index 596c5f44a73..5b2148498df 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp +++ b/tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -6,9 +6,8 @@ #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -class CoreCoordHarness : public ::testing::Test { +class CoreCoordFixture : public ::testing::Test { protected: CoreRange cr1 = CoreRange({0, 0}, {1, 1}); CoreRange cr2 = CoreRange({3, 3}, {5, 4}); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp similarity index 86% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp index 04cdc9f15c2..a1aedf9403e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp @@ -8,7 +8,7 @@ namespace basic_tests::CoreRangeSet{ -TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct) +TEST_F(CoreCoordFixture, TestCoreRangeSetValidConstruct) { EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr2})); EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2})); @@ -17,7 +17,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct) EXPECT_EQ(valid_ranges.ranges().size(), 2); } -TEST_F(CoreCoordHarness, TestCoreRangeSetInvalidConstruct) +TEST_F(CoreCoordFixture, TestCoreRangeSetInvalidConstruct) { ::CoreRange overlapping_range({1, 2}, {3, 3}); EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2, overlapping_range})); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp index a76802a2e35..6a3eefbf03f 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetContains) { +TEST_F(CoreCoordFixture, TestCoreRangeSetContains) { // Contains CoreCoord EXPECT_TRUE(::CoreRangeSet(this->cr1).contains(this->cr5.start_coord)); EXPECT_TRUE(::CoreRangeSet(this->cr5).contains(this->cr1.end_coord)); @@ -33,7 +33,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetContains) { EXPECT_TRUE(::CoreRangeSet(this->cr12).contains(::CoreRangeSet(std::vector{this->sc6, this->cr11}))); } -TEST_F(CoreCoordHarness, TestCoreRangeSetNotContains) { +TEST_F(CoreCoordFixture, TestCoreRangeSetNotContains) { // Not Contains CoreCoord EXPECT_FALSE(::CoreRangeSet(this->cr1).contains(this->cr2.start_coord)); EXPECT_FALSE( diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp index 3815de620e2..fb1f406412f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeSetIntersects) { // Intersects CoreCoord EXPECT_TRUE(::CoreRangeSet(this->cr1).intersects(this->cr5.start_coord)); EXPECT_TRUE(::CoreRangeSet(this->cr5).intersects(this->cr1.end_coord)); @@ -32,7 +32,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetIntersects) { EXPECT_TRUE(::CoreRangeSet(this->sc2).intersects(::CoreRangeSet(std::vector{this->cr7, this->cr1}))); } -TEST_F(CoreCoordHarness, 
TestCoreRangeSetNotIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeSetNotIntersects) { // Not Intersects CoreCoord EXPECT_FALSE(::CoreRangeSet(this->cr1).intersects(this->cr2.start_coord)); EXPECT_FALSE( diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp similarity index 73% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp index 778e5083538..2adfb440ba2 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp @@ -11,43 +11,43 @@ namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeNoSolution) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeNoSolution) { EXPECT_EQ(::CoreRangeSet(sc1).merge(std::set{sc3}), ::CoreRangeSet(std::set{sc1, sc3})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}), ::CoreRangeSet(std::set{cr1, cr2})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1, cr2}), ::CoreRangeSet(std::set{cr1, cr2})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}).merge(std::set{cr3}), ::CoreRangeSet(std::set{cr1, cr2, cr3})); } -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeCoreCoord) { ::CoreRangeSet empty_crs; EXPECT_EQ(empty_crs.merge(std::set{this->sc1}).ranges().size(), 1); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3, sc4}), ::CoreRangeSet(std::set{cr16})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3}).merge(std::set{sc4}), ::CoreRangeSet(std::set{cr16})); - CoreRange rect({0, 0}, {4, 2}); - std::set rect_pts; + ::CoreRange rect({0, 0}, {4, 2}); + std::set<::CoreRange> rect_pts; for (unsigned y = rect.start_coord.y; y <= rect.end_coord.y; y++) { for (unsigned x = rect.start_coord.x; x <= rect.end_coord.x; x++) { - 
rect_pts.insert(CoreRange({x, y}, {x, y})); + rect_pts.insert(::CoreRange({x, y}, {x, y})); } } EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect})); // upside-down "T" - rect_pts.insert({CoreRange({2, 0}, {3, 5})}); - EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect, CoreRange({2, 3}, {3, 5})})); + rect_pts.insert({::CoreRange({2, 0}, {3, 5})}); + EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect, ::CoreRange({2, 3}, {3, 5})})); // "H", sub-optimal currently, should be reduced down to 3 CRs instead of 5 EXPECT_EQ( - empty_crs.merge(std::vector{CoreRange{{0, 0}, {1, 5}}, CoreRange{{3, 0}, {4, 5}}, CoreRange{{0, 2}, {4, 3}}}), + empty_crs.merge(std::vector{::CoreRange{{0, 0}, {1, 5}}, ::CoreRange{{3, 0}, {4, 5}}, ::CoreRange{{0, 2}, {4, 3}}}), ::CoreRangeSet(std::set{ - CoreRange{{0, 0}, {1, 1}}, - CoreRange{{0, 2}, {4, 3}}, - CoreRange{{0, 4}, {1, 5}}, - CoreRange{{3, 0}, {4, 1}}, - CoreRange{{3, 4}, {4, 5}}})); + ::CoreRange{{0, 0}, {1, 1}}, + ::CoreRange{{0, 2}, {4, 3}}, + ::CoreRange{{0, 4}, {1, 5}}, + ::CoreRange{{3, 0}, {4, 1}}, + ::CoreRange{{3, 4}, {4, 5}}})); } -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreRange) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeCoreRange) { EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1}), ::CoreRangeSet(std::set{cr1})); EXPECT_EQ(::CoreRangeSet(cr7).merge(std::set{cr6}).merge(std::set{cr4}), ::CoreRangeSet(std::set{cr8})); EXPECT_EQ( diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp similarity index 91% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp index f08976402d6..f9b386a71ea 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp @@ -10,7 
+10,7 @@ namespace basic_tests::CoreRange{ -TEST_F(CoreCoordHarness, TestCoreRangeAdjacent) +TEST_F(CoreCoordFixture, TestCoreRangeAdjacent) { EXPECT_TRUE ( this->cr1.adjacent(this->cr9) ); EXPECT_TRUE ( this->cr9.adjacent(this->cr1) ); @@ -23,7 +23,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeAdjacent) } -TEST_F(CoreCoordHarness, TestCoreRangeNotAdjacent){ +TEST_F(CoreCoordFixture, TestCoreRangeNotAdjacent){ EXPECT_FALSE ( this->cr2.adjacent(this->cr3)); EXPECT_FALSE ( this->cr1.adjacent(this->cr6)); EXPECT_FALSE ( this->cr1.adjacent(this->cr11)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp index a9369eb7445..905b59123fe 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeContains) { +TEST_F(CoreCoordFixture, TestCoreRangeContains) { // Contains CoreCoord EXPECT_TRUE(this->cr1.contains(this->sc1.start_coord)); EXPECT_TRUE(this->cr1.contains(this->cr1.start_coord)); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeContains) { EXPECT_TRUE(this->cr4.contains(::CoreRangeSet(std::vector{this->cr1, this->cr2, this->cr3}))); } -TEST_F(CoreCoordHarness, TestCoreRangeNotContains) { +TEST_F(CoreCoordFixture, TestCoreRangeNotContains) { // Not Contains CoreCoord EXPECT_FALSE(this->sc1.contains(this->cr1.start_coord)); EXPECT_FALSE(this->sc1.contains(this->sc2.start_coord)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp similarity index 90% rename from 
tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp index 29eeeb591b8..409bf123f26 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp @@ -8,7 +8,7 @@ namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeIntersects) { EXPECT_TRUE(this->cr1.intersects(this->cr5)); EXPECT_EQ(this->cr1.intersection(this->cr5).value(), ::CoreRange({1, 0}, {1, 1})); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeIntersects) { EXPECT_EQ(this->cr7.intersection(this->cr8).value(), this->cr7); } -TEST_F(CoreCoordHarness, TestCoreRangeNotIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeNotIntersects) { EXPECT_FALSE(this->cr1.intersects(this->cr2)); EXPECT_FALSE(this->sc1.intersects(this->cr2)); EXPECT_FALSE(this->cr1.intersects(this->cr7)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp index 5729e1a6c4b..7b41528637c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp @@ -11,7 +11,7 @@ using std::vector; namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeIterator) +TEST_F(CoreCoordFixture, TestCoreRangeIterator) { vector cores_in_core_range; diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp similarity index 92% rename from 
tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp index db8a1b2c7ad..a3da8c8b90d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRange{ -TEST_F(CoreCoordHarness, TestCoreRangeMerge) +TEST_F(CoreCoordFixture, TestCoreRangeMerge) { EXPECT_EQ ( this->sc1.merge(this->sc1).value(), this->sc1 ); EXPECT_EQ ( this->cr4.merge(this->cr5).value(), this->cr6 ); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeMerge) } -TEST_F(CoreCoordHarness, TestCoreRangeNotMergeable){ +TEST_F(CoreCoordFixture, TestCoreRangeNotMergeable){ EXPECT_FALSE ( this->cr1.merge(this->cr3)); EXPECT_FALSE ( this->cr2.merge(this->cr3)); EXPECT_FALSE ( this->cr1.merge(this->cr6)); diff --git a/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp new file mode 100644 index 00000000000..54a015a8146 --- /dev/null +++ b/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "gtest/gtest.h" + +#include "command_queue_fixture.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/scoped_timer.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/buffers/circular_buffer.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt::tt_metal; + +namespace host_tests { + +TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestAccessCommandQueue) { + for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { + EXPECT_NO_THROW(devices_[device_id]->command_queue()); + } +} + +TEST_F(CommandQueueFixture, TestCannotAccessCommandQueueForClosedDevice) { + EXPECT_NO_THROW(device_->command_queue()); + CloseDevice(device_); + EXPECT_ANY_THROW(device_->command_queue()); +} + +TEST_F(CommandQueueFixture, DISABLED_TensixTestAsyncAssertForDeprecatedAPI) { + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + CoreCoord core = {0, 0}; + uint32_t buf_size = 4096; + uint32_t page_size = 4096; + auto dummy_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + auto src0 = Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); + std::vector runtime_args = {src0->address()}; + try { + SetRuntimeArgs(program, dummy_kernel, core, runtime_args); + } catch (std::runtime_error &e) { + std::string expected = + "This variant of SetRuntimeArgs can only be called when Asynchronous SW Command Queues are disabled for " + "Fast Dispatch."; + const string error = string(e.what()); + EXPECT_TRUE(error.find(expected) != std::string::npos); + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueProgramFixture, TensixTestAsyncCommandQueueSanityAndProfile) { + 
auto& command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + + CoreRange cr({0, 0}, {0, 0}); + CoreRangeSet cr_set({cr}); + // Add an NCRISC blank manually, but in compile program, the BRISC blank will be + // added separately + auto dummy_reader_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", + cr_set, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + // Use scoper timer to benchmark time for pushing 2 commands + { + tt::ScopedTimer timer("AsyncCommandQueue"); + EnqueueProgram(command_queue, program, false); + Finish(command_queue); + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueBufferFixture, DISABLED_TensixTestAsyncCBAllocation) { + // Test asynchronous allocation of buffers and their assignment to CBs + auto& command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + + const uint32_t num_pages = 1; + const uint32_t page_size = detail::TileSize(tt::DataFormat::Float16_b); + const tt::DataFormat data_format = tt::DataFormat::Float16_b; + + auto buffer_size = page_size; + tt::tt_metal::InterleavedBufferConfig buff_config{ + .device=this->device_, + .size = buffer_size, + .page_size = buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1 + }; + // Asynchronously allocate an L1 Buffer + auto l1_buffer = CreateBuffer(buff_config); + CoreRange cr({0, 0}, {0, 2}); + CoreRangeSet cr_set({cr}); + std::vector buffer_indices = {16, 24}; + + CircularBufferConfig config1 = CircularBufferConfig(page_size, {{buffer_indices[0], data_format}, {buffer_indices[1], data_format}}, *l1_buffer) + .set_page_size(buffer_indices[0], page_size) + .set_page_size(buffer_indices[1], 
page_size); + // Asynchronously assign the L1 Buffer to the CB + auto multi_core_cb = CreateCircularBuffer(program, cr_set, config1); + auto cb_ptr = detail::GetCircularBuffer(program, multi_core_cb); + Finish(this->device_->command_queue()); + // Addresses should match + EXPECT_EQ(cb_ptr->address(), l1_buffer->address()); + // Asynchronously allocate a new L1 buffer + auto l1_buffer_2 = CreateBuffer(buff_config); + // Asynchronously update CB address to match new L1 buffer + UpdateDynamicCircularBufferAddress(program, multi_core_cb, *l1_buffer_2); + Finish(this->device_->command_queue()); + // Addresses should match + EXPECT_EQ(cb_ptr->address(), l1_buffer_2->address()); + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestDirectedLoopbackToUniqueHugepage) { + std::unordered_map> golden_data; + + const uint32_t byte_size = 2048 * 16; + const uint64_t address = 0; + + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + std::vector data = + tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); + + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().write_sysmem( + data.data(), data.size() * sizeof(uint32_t), address, mmio_device_id, channel); + + golden_data[device_id] = data; + } + + std::vector readback_data; + readback_data.resize(byte_size / sizeof(uint32_t)); + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().read_sysmem(readback_data.data(), byte_size, address, mmio_device_id, channel); + EXPECT_EQ(readback_data, golden_data.at(device_id)); + } +} +} // namespace host_tests diff 
--git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/api/test_banked.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp rename to tests/tt_metal/tt_metal/api/test_banked.cpp index 9f1d68e7440..0479cf75db4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp +++ b/tests/tt_metal/tt_metal/api/test_banked.cpp @@ -264,14 +264,14 @@ detail::LaunchProgram(device, program); } // end namespace local_test_functions -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, false)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::L1) % 2 == 0, "Error"); @@ -289,7 +289,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -298,7 +298,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::DRAM) % 2 == 0, "Error"); @@ -318,14 +318,14 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderOnly) { } 
} -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1WriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1WriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1WriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1WriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::L1) % 2 == 0, "Error"); @@ -343,7 +343,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1WriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramWriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -352,7 +352,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramWriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramWriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::DRAM) % 2 == 0, "Error"); @@ -372,14 +372,14 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramWriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderAndWriter) { for (unsigned int 
id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -397,7 +397,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -406,7 +406,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -426,7 +426,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderAndL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -434,7 +434,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderAndL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -454,7 +454,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { for (unsigned int id = 0; id < 
num_devices_; id++) { BankedConfig test_config; test_config.output_buffer_type = BufferType::DRAM; @@ -462,7 +462,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.output_buffer_type = BufferType::DRAM; @@ -482,7 +482,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -501,7 +501,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::DRAM); @@ -521,7 +521,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -542,7 +542,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer) { +TEST_F(DeviceFixture, 
TensixTestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp b/tests/tt_metal/tt_metal/api/test_bit_utils.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp rename to tests/tt_metal/tt_metal/api/test_bit_utils.cpp index 99f62def780..badbcaa7bca 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp +++ b/tests/tt_metal/tt_metal/api/test_bit_utils.cpp @@ -6,7 +6,7 @@ #include #include -TEST(NoFixture, ExtractBitArray) { +TEST(Host, ExtractBitArray) { uint32_t src[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0}; // 1. Extract the 20-bit elements from the 32-bit source array. uint32_t dest[4]; @@ -25,7 +25,7 @@ TEST(NoFixture, ExtractBitArray) { EXPECT_EQ(dest[3], 0x9abc); } -TEST(NoFixture, PackBitArray) { +TEST(Host, PackBitArray) { uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; uint32_t dest[8]; @@ -56,7 +56,7 @@ TEST(NoFixture, PackBitArray) { EXPECT_EQ(dest[0], expected); } -TEST(NoFixture, PackExtractBitArray) { +TEST(Host, PackExtractBitArray) { uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; for (uint num_pack_bits = 3; num_pack_bits <= 31; num_pack_bits++) { @@ -70,7 +70,7 @@ TEST(NoFixture, PackExtractBitArray) { } } -TEST(NoFixture, ExtractPackBitArray) { +TEST(Host, ExtractPackBitArray) { uint32_t src[4] = { 0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0 }; // Compute the number of 3-bit elements that can be packed into 4 x 32-bit elements diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/api/test_direct.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp rename to tests/tt_metal/tt_metal/api/test_direct.cpp index 5c86e8feadf..3defe49c6ec 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp +++ b/tests/tt_metal/tt_metal/api/test_direct.cpp @@ -372,7 +372,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter } } // namespace unit_tests::dram::direct -TEST_F(DeviceFixture, SingleCoreDirectDramReaderOnly) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); ASSERT_TRUE( @@ -383,7 +383,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramReaderOnly) { unit_tests::dram::direct::reader_only(devices_.at(id), 16 * 1024, l1_unreserved_base, CoreCoord(0, 0))); } } -TEST_F(DeviceFixture, SingleCoreDirectDramWriterOnly) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); ASSERT_TRUE( @@ -394,7 +394,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramWriterOnly) { unit_tests::dram::direct::writer_only(devices_.at(id), 16 * 1024, l1_unreserved_base, CoreCoord(0, 0))); } } -TEST_F(DeviceFixture, SingleCoreDirectDramReaderWriter) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderWriter) { unit_tests::dram::direct::ReaderWriterConfig test_config = { .num_tiles = 1, .tile_byte_size = 2 * 32 * 32, @@ -409,7 +409,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramReaderWriter) { ASSERT_TRUE(unit_tests::dram::direct::reader_writer(devices_.at(id), test_config)); } } -TEST_F(DeviceFixture, SingleCoreDirectDramReaderDatacopyWriter) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderDatacopyWriter) { unit_tests::dram::direct::ReaderDatacopyWriterConfig test_config = { .num_tiles = 1, .tile_byte_size = 2 * 32 * 32, diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp similarity index 93% rename from 
tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp rename to tests/tt_metal/tt_metal/api/test_dram.cpp index 9b2f241bb72..463a6f4dfb1 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram.cpp @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" @@ -24,7 +24,7 @@ struct DRAMConfig{ tt_metal::DataMovementConfig data_movement_cfg; }; -bool dram_single_core_db (CommonFixture* fixture, tt_metal::Device *device){ +bool dram_single_core_db (DispatchFixture* fixture, tt_metal::Device *device){ tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -90,7 +90,7 @@ bool dram_single_core_db (CommonFixture* fixture, tt_metal::Device *device){ return input_vec == result_vec; } -bool dram_single_core (CommonFixture* fixture, tt_metal::Device *device, const DRAMConfig &cfg, std::vector src_vec){ +bool dram_single_core (DispatchFixture* fixture, tt_metal::Device *device, const DRAMConfig &cfg, std::vector src_vec){ // Create a program tt_metal::Program program = CreateProgram(); @@ -139,7 +139,7 @@ bool dram_single_core (CommonFixture* fixture, tt_metal::Device *device, const D } } -TEST_F(CommonFixture, DRAMLoopbackSingleCore){ +TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCore){ uint32_t buffer_size = 2 * 1024 * 25; std::vector src_vec = create_random_vector_of_bfloat16( buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); @@ -155,8 +155,8 @@ TEST_F(CommonFixture, DRAMLoopbackSingleCore){ } } -TEST_F(CommonFixture, DRAMLoopbackSingleCoreDB){ - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ +TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCoreDB){ + if (!this->IsSlowDispatch()){ 
tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp rename to tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp index 39bac896ea0..3f770ff57ca 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" @@ -23,7 +23,7 @@ struct DRAMtoL1MulticastConfig{ CoreCoord exclude_direction; }; -bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){ +bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){ bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -123,7 +123,7 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons } } -TEST_F(CommonFixture, DRAMtoL1Multicast){ +TEST_F(DispatchFixture, TensixDRAMtoL1Multicast){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -133,7 +133,7 @@ TEST_F(CommonFixture, DRAMtoL1Multicast){ ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); } } -TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastLoopbackSrc){ 
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 500 * 1024, .target_grid_offset = 0, @@ -143,7 +143,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionUpLeft){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 0, //source core is in exclusion zone, don't count twice @@ -160,7 +160,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionUpRight){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -177,7 +177,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionDownLeft){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -194,7 +194,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownRight){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionDownRight){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, diff --git a/tests/tt_metal/tt_metal/unit_tests/global_semaphore/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp similarity index 100% rename from 
tests/tt_metal/tt_metal/unit_tests/global_semaphore/test_global_semaphores.cpp rename to tests/tt_metal/tt_metal/api/test_global_semaphores.cpp diff --git a/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp new file mode 100644 index 00000000000..19e8b4826af --- /dev/null +++ b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "gtest/gtest.h" +#include "common/core_coord.hpp" +#include "dispatch_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "host_api.hpp" +#include "compile_program_with_kernel_path_env_var_fixture.hpp" + +using namespace tt; + +// Ensures we can successfully create kernels on available compute grid +TEST_F(DispatchFixture, TensixCreateKernelsOnComputeCores) { + for (unsigned int id = 0; id < this->devices_.size(); id++) { + tt_metal::Program program = CreateProgram(); + CoreCoord compute_grid = this->devices_.at(id)->compute_with_storage_grid_size(); + EXPECT_NO_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + CoreRange(CoreCoord(0, 0), CoreCoord(compute_grid.x, compute_grid.y)), + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } +} + +TEST_F(DispatchFixture, DISABLED_TensixCreateKernelsOnStorageCores) { + for (unsigned int id = 0; id < this->devices_.size(); id++) { + if (this->devices_.at(id)->storage_only_cores().empty()) { + GTEST_SKIP() << "This test only runs on devices with storage only cores"; + } + tt_metal::Program program = CreateProgram(); + const std::set& storage_only_cores = this->devices_.at(id)->storage_only_cores(); + std::set storage_only_core_ranges; + for (CoreCoord core : storage_only_cores) { + storage_only_core_ranges.emplace(core); + } + CoreRangeSet 
storage_core_range_set(storage_only_core_ranges); + EXPECT_ANY_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + storage_core_range_set, + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } +} + +TEST_F(DispatchFixture, DISABLED_TensixIdleEthCreateKernelsOnDispatchCores) { + if (this->IsSlowDispatch()) { + GTEST_SKIP() << "This test is only supported in fast dispatch mode"; + } + for (unsigned int id = 0; id < this->devices_.size(); id++) { + tt_metal::Program program = CreateProgram(); + Device* device = this->devices_.at(id); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + std::vector dispatch_cores = tt::get_logical_dispatch_cores(device->id(), device->num_hw_cqs(), dispatch_core_type); + std::set dispatch_core_ranges; + for (CoreCoord core : dispatch_cores) { + dispatch_core_ranges.emplace(core); + } + CoreRangeSet dispatch_core_range_set(dispatch_core_ranges); + if (dispatch_core_type == CoreType::WORKER) { + EXPECT_ANY_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + CoreRangeSet(dispatch_core_range_set), + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } else if (dispatch_core_type == CoreType::ETH) { + EXPECT_ANY_THROW(auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/misc/erisc_print.cpp", + CoreRangeSet(dispatch_core_range_set), + EthernetConfig{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0});); + } + } +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderMetalRootDir) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + create_kernel(kernel_file); + 
detail::CompileProgram(this->device_, this->program_); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderKernelRootDir) { + const string &orig_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + const string &new_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/new_kernel.cpp"; + this->setup_kernel_dir(orig_kernel_file, new_kernel_file); + this->create_kernel(new_kernel_file); + detail::CompileProgram(this->device_, this->program_); + this->cleanup_kernel_dir(); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderMetalRootDirAndKernelRootDir) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + this->setup_kernel_dir(kernel_file, kernel_file); + this->create_kernel(kernel_file); + detail::CompileProgram(this->device_, this->program_); + this->cleanup_kernel_dir(); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixNonExistentKernel) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/non_existent_kernel.cpp"; + this->create_kernel(kernel_file); + EXPECT_THROW(detail::CompileProgram(this->device_, this->program_), std::exception); +} diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp b/tests/tt_metal/tt_metal/api/test_noc.cpp similarity index 78% rename from tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp rename to tests/tt_metal/tt_metal/api/test_noc.cpp index 278e5289e28..65640d96a39 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp +++ b/tests/tt_metal/tt_metal/api/test_noc.cpp @@ -4,17 +4,10 @@ #include -#include -#include -#include - -#include "basic_fixture.hpp" #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace 
tt; using namespace tt::test_utils; @@ -68,11 +61,48 @@ void read_translation_table (Device* device, CoreCoord logical_node, std::vector #endif } -} // namespace unit_tests::basic::device +} // namespace unit_tests::basic::test_noc +TEST(NOC, TensixSingleDeviceHarvestingPrints) { + auto arch = tt::get_arch_from_string(get_umd_arch_name()); + tt::tt_metal::Device* device; + const unsigned int device_id = 0; + device = tt::tt_metal::CreateDevice(device_id); + CoreCoord unharvested_logical_grid_size; + switch (arch) { + case tt::ARCH::GRAYSKULL: unharvested_logical_grid_size = CoreCoord(12, 10); break; + case tt::ARCH::WORMHOLE_B0: unharvested_logical_grid_size = CoreCoord(8, 10); break; + case tt::ARCH::BLACKHOLE: unharvested_logical_grid_size = CoreCoord(14, 10); break; + default: + TT_THROW("Unsupported arch {}", get_umd_arch_name()); + } + auto logical_grid_size = device->logical_grid_size(); + if (logical_grid_size == unharvested_logical_grid_size) { + tt::log_info("Harvesting Disabled in SW"); + } else { + tt::log_info("Harvesting Enabled in SW"); + tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); + } + tt::log_info("Logical -- Noc Coordinates Mapping"); + tt::log_info("[Logical <-> NOC0] Coordinates"); + for (int r = 0; r < logical_grid_size.y; r++) { + string output_row = ""; + for (int c = 0; c < logical_grid_size.x; c++) { + const CoreCoord logical_coord(c, r); + const auto noc_coord = device->worker_core_from_logical_core(logical_coord); + output_row += "{L[x" + std::to_string(c); + output_row += "-y" + std::to_string(r); + output_row += "]:N[x" + std::to_string(noc_coord.x); + output_row += "-y" + std::to_string(noc_coord.y); + output_row += "]}, "; + } + tt::log_info("{}", output_row); + } + ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); +} -TEST_F(BasicFixture, VerifyNocNodeIDs) { +TEST(NOC, TensixVerifyNocNodeIDs) { auto arch = tt::get_arch_from_string(get_umd_arch_name()); tt::tt_metal::Device* 
device; const unsigned int device_id = 0; @@ -95,7 +125,7 @@ TEST_F(BasicFixture, VerifyNocNodeIDs) { } ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); } -TEST_F(BasicFixture, VerifyNocIdentityTranslationTable) { +TEST(NOC, TensixVerifyNocIdentityTranslationTable) { auto arch = tt::get_arch_from_string(get_umd_arch_name()); if (arch == tt::ARCH::BLACKHOLE) { GTEST_SKIP(); @@ -133,14 +163,12 @@ TEST_F(BasicFixture, VerifyNocIdentityTranslationTable) { // Tests that kernel can write to and read from a stream register address // This is meant to exercise noc_inline_dw_write API -TEST_F(DeviceFixture, DirectedStreamRegWriteRead) { +TEST_F(DeviceFixture, TensixDirectedStreamRegWriteRead) { CoreCoord start_core{0, 0}; const uint32_t stream_id = 0; const uint32_t stream_reg = 4; for (tt_metal::Device *device : this->devices_) { - std::set storage_only_cores = device->storage_only_cores(); - tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord logical_grid_size = device->compute_with_storage_grid_size(); CoreCoord end_core{logical_grid_size.x - 1, logical_grid_size.y - 1}; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp rename to tests/tt_metal/tt_metal/api/test_runtime_args.cpp index 520d04986d2..31d83f4783f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp +++ b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp @@ -4,16 +4,10 @@ #include -#include -#include -#include - #include "device_fixture.hpp" - +#include "kernels/kernel.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" - using namespace tt; using namespace tt::tt_metal; @@ -168,7 +162,7 @@ bool verify_results( } // Write unique and common runtime args to device and readback to verify written correctly. 
-TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) { +TEST_F(DeviceFixture, TensixLegallyModifyRTArgsDataMovement) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -214,7 +208,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) { } } -TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) { +TEST_F(DeviceFixture, TensixLegallyModifyRTArgsCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -244,7 +238,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) { } // Don't cover all cores of kernel with SetRuntimeArgs. Verify that correct offset used to access common runtime args. -TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsSubsetOfCoresCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -272,7 +266,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) { } // Different unique runtime args per core. Not overly special, but verify that it works. -TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsUniqueValuesCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -305,7 +299,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) { // Some cores have more unique runtime args than others. Unused in kernel, but API supports it, so verify it works and that // common runtime args are appropriately offset by amount from core(s) with most unique runtime args. 
-TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsVaryingLengthPerCore) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args @@ -356,7 +350,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { } // Too many unique and common runtime args, overflows allowed space and throws expected exception from both unique/common APIs. -TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) { +TEST_F(DeviceFixture, TensixIllegalTooManyRuntimeArgs) { for (unsigned int id = 0; id < num_devices_; id++) { CoreRange first_core_range(CoreCoord(1, 1), CoreCoord(2, 2)); CoreRangeSet core_range_set(first_core_range); @@ -376,7 +370,7 @@ TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) { } } -TEST_F(DeviceFixture, IllegallyModifyRTArgs) { +TEST_F(DeviceFixture, TensixIllegallyModifyRTArgs) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -408,7 +402,6 @@ TEST_F(DeviceFixture, IllegallyModifyRTArgs) { SetCommonRuntimeArgs(program, 0, common_runtime_args); std::vector illegal_common_runtime_args = {0, 1, 2, 3, 4, 5}; EXPECT_ANY_THROW(SetCommonRuntimeArgs(program, 0, illegal_common_runtime_args)); - } } diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_semaphores.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp rename to tests/tt_metal/tt_metal/api/test_semaphores.cpp index 9be219332a0..3cd83770dfa 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_semaphores.cpp @@ -4,10 +4,6 @@ #include -#include -#include -#include - #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/detail/util.hpp" @@ -102,7 +98,7 @@ void 
try_creating_more_than_max_num_semaphores( } // namespace unit_tests::initialize_semaphores -TEST_F(DeviceFixture, InitializeLegalSemaphores) { +TEST_F(DeviceFixture, TensixInitializeLegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); @@ -111,7 +107,7 @@ TEST_F(DeviceFixture, InitializeLegalSemaphores) { } } -TEST_F(DeviceFixture, InitializeIllegalSemaphores) { +TEST_F(DeviceFixture, TensixInitializeIllegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); @@ -121,7 +117,7 @@ TEST_F(DeviceFixture, InitializeIllegalSemaphores) { } } -TEST_F(DeviceFixture, CreateMultipleSemaphoresOnSameCore) { +TEST_F(DeviceFixture, TensixCreateMultipleSemaphoresOnSameCore) { tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core0(0,0); diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp b/tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp rename to tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp index 9b59692df8f..27c68e515de 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp +++ b/tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp @@ -4,16 +4,10 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "tt_metal/common/logger.hpp" -#include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/common/constants.hpp" -#include +#include "tt_metal/test_utils/stimulus.hpp" using namespace tt::tt_metal; diff --git 
a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp index 27eadb2448e..32c5e8e9255 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp @@ -4,14 +4,10 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "test_buffer_utils.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" +#include "buffer_test_utils.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/stimulus.hpp" - using tt::tt_metal::Device; using namespace tt::test_utils; using namespace tt::test::buffer::detail; diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp index 80146a83635..4c3cfbf3a11 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp @@ -4,12 +4,9 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "test_buffer_utils.hpp" -#include "tt_metal/host_api.hpp" +#include "buffer_test_utils.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" @@ -160,7 +157,7 @@ TEST_F(DeviceFixture, TestSimpleL1BufferWriteOnlyHi) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileLo) { 
+TEST_F(DeviceFixture, TensixTestSimpleL1ReadWriteTileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -172,7 +169,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileLo) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileHi) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWriteTileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -184,7 +181,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileHi) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileLo) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWritex2y2TileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -196,7 +193,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileLo) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileHi) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWritex2y2TileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -208,7 +205,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileHi) { } } -TEST_F(DeviceFixture, TestBufferL1ReadWriteTileLo) { +TEST_F(DeviceFixture, TensixTestBufferL1ReadWriteTileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -220,7 +217,7 @@ TEST_F(DeviceFixture, TestBufferL1ReadWriteTileLo) { } } -TEST_F(DeviceFixture, TestBufferL1ReadWriteTileHi) { +TEST_F(DeviceFixture, TensixTestBufferL1ReadWriteTileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp 
b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp rename to tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index 1fb0e630fd9..657f6996b23 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp +++ b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -8,13 +8,10 @@ #include #include -#include "basic_fixture.hpp" #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; @@ -43,7 +40,7 @@ namespace unit_tests::basic::soc_desc { // This test ensures that no logical core maps to a harvested row -TEST_F(BasicFixture, ValidateLogicalToPhysicalCoreCoordHostMapping) { +TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { size_t num_devices = tt_metal::GetNumAvailableDevices(); ASSERT_TRUE(num_devices > 0); tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); @@ -67,7 +64,7 @@ TEST_F(BasicFixture, ValidateLogicalToPhysicalCoreCoordHostMapping) { } } -TEST_F(DeviceFixture, ValidateMetalSocDescriptors) { +TEST_F(DeviceFixture, TensixValidateMetalSocDescriptors) { for (chip_id_t device_id = 0; device_id < this->num_devices_; device_id++) { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(device_id); diff --git a/tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp b/tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp similarity index 84% rename from tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp rename to tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp index 0c2379b4e19..040b69cc151 100644 --- a/tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp +++ 
b/tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp @@ -4,7 +4,6 @@ #include #include -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" #include "tt_metal/common/tilize_untilize.hpp" template @@ -35,7 +34,7 @@ void tilize_untilize_helper(uint max_num_batches, uint max_num_row_tiles, uint m } // The following run the tilize/untilize APIs and their inverses -TEST_F(BasicFixture, TestTilizeAndThenUntilizeBfloat16) { +TEST(Host, TestTilizeAndThenUntilizeBfloat16) { uint max_num_batches = 8; uint max_num_row_tiles = 8; uint max_num_col_tiles = 8; @@ -45,12 +44,12 @@ TEST_F(BasicFixture, TestTilizeAndThenUntilizeBfloat16) { tilize_untilize_helper(max_num_batches, max_num_row_tiles, max_num_col_tiles, TILE_HEIGHT, TILE_WIDTH); } -TEST_F(BasicFixture, TestTilizeThrowErrorForNonBfloat16DataType) { +TEST(Host, TestTilizeThrowErrorForNonBfloat16DataType) { std::vector vec(1024, 0); EXPECT_ANY_THROW(tilize(vec, 32, 32)); } -TEST_F(BasicFixture, TestTilizeThrowErrorForInvalidTileMandN) { +TEST(Host, TestTilizeThrowErrorForInvalidTileMandN) { // m and n are not divisible by tile size std::vector vec(16, 0); EXPECT_ANY_THROW(tilize(vec, 4, 4)); // m and n not divisible by 32 @@ -59,19 +58,19 @@ TEST_F(BasicFixture, TestTilizeThrowErrorForInvalidTileMandN) { EXPECT_ANY_THROW(tilize(vec, 0, 0)); } -TEST_F(BasicFixture, TestTilizeThrowErrorForInvalidVectorShape) { +TEST(Host, TestTilizeThrowErrorForInvalidVectorShape) { std::vector vec(16, 0); // Size not divisible by 1024 EXPECT_ANY_THROW(tilize(vec, 32, 32)); // m and n not divisible by 32 vec = {}; // Cannot have a zero vector either EXPECT_ANY_THROW(tilize(vec, 32, 32)); // m and n not divisible by 32 } -TEST_F(BasicFixture, TestUntilizeThrowErrorForNonBfloat16DataType) { +TEST(Host, TestUntilizeThrowErrorForNonBfloat16DataType) { std::vector vec(1024, 0); EXPECT_ANY_THROW(untilize(vec, 32, 32)); } -TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidTileMandN) { +TEST(Host, 
TestUntilizeThrowErrorForInvalidTileMandN) { // m and n are not divisible by tile side lengths std::vector vec(16, 0); EXPECT_ANY_THROW(untilize(vec, 4, 4)); @@ -80,14 +79,14 @@ TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidTileMandN) { EXPECT_ANY_THROW(untilize(vec, 0, 0)); } -TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidVectorShape) { +TEST(Host, TestUntilizeThrowErrorForInvalidVectorShape) { std::vector vec(16, 0); // Size not divisible by 1024 EXPECT_ANY_THROW(untilize(vec, 32, 32)); // m and n not divisible by 32 vec = {}; // Cannot have a zero vector either EXPECT_ANY_THROW(untilize(vec, 32, 32)); // m and n not divisible by 32 } -TEST_F(BasicFixture, TestUntilizeAndThenTilizeBfloat16) { +TEST(Host, TestUntilizeAndThenTilizeBfloat16) { uint max_num_batches = 8; uint max_num_row_tiles = 8; uint max_num_col_tiles = 8; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp b/tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp similarity index 87% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp index e0c4083eced..3c3a31cf314 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp @@ -1,18 +1,16 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include - #include "gtest/gtest.h" #include "tt_metal/impl/dispatch/worker_config_buffer.hpp" using std::vector; using namespace tt::tt_metal; -namespace working_config_buffer_tests { +namespace worker_config_buffer_tests { -TEST(WorkingConfigBuffer, MarkCompletelyFull) { +TEST(WorkerConfigBuffer, MarkCompletelyFull) { WorkerConfigBufferMgr mgr; mgr.init_add_buffer(1024, 1024); mgr.init_add_buffer(2, 1024); @@ -56,4 +54,4 @@ TEST(WorkerConfigBuffer, SmallSize) { } } -} // namespace working_config_buffer_tests +} // namespace worker_config_buffer_tests diff --git a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp new file mode 100644 index 00000000000..a3a18fdc229 --- /dev/null +++ b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp @@ -0,0 +1,161 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gtest/gtest.h" +#include "dispatch_fixture.hpp" +#include "hostdevcommon/common_values.hpp" +#include "impl/device/device.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/llrt/rtoptions.hpp" + +class CommandQueueFixture : public DispatchFixture { + protected: + tt::tt_metal::Device *device_; + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_device(); + } + + void TearDown() override { + if (!this->IsSlowDispatch()) { + tt::tt_metal::CloseDevice(this->device_); + } + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info( + tt::LogTest, 
"This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + } + + void create_device(const size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { + const chip_id_t device_id = 0; + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + this->device_ = + tt::tt_metal::CreateDevice(device_id, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); + } +}; + +class CommandQueueEventFixture : public CommandQueueFixture {}; + +class CommandQueueBufferFixture : public CommandQueueFixture {}; + +class CommandQueueProgramFixture : public CommandQueueFixture {}; + +class CommandQueueTraceFixture : public CommandQueueFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } + + void CreateDevice(const size_t trace_region_size) { + this->create_device(trace_region_size); + } +}; + +class CommandQueueSingleCardFixture : virtual public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(); + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices(const std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + const chip_id_t mmio_device_id = 0; + this->reserved_devices_ = tt::tt_metal::detail::CreateDevices( + {mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, 
trace_region_size, dispatch_core_type); + auto enable_remote_chip = getenv("TT_METAL_ENABLE_REMOTE_CHIP"); + if (enable_remote_chip) { + for (const auto &[id, device] : this->reserved_devices_) { + this->devices_.push_back(device); + } + } else { + this->devices_.push_back(this->reserved_devices_.at(mmio_device_id)); + } + } + + std::vector devices_; + std::map reserved_devices_; +}; + +class CommandQueueSingleCardBufferFixture : public CommandQueueSingleCardFixture {}; + +class CommandQueueSingleCardTraceFixture : virtual public CommandQueueSingleCardFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(90000000); + } +}; + +class CommandQueueSingleCardProgramFixture : virtual public CommandQueueSingleCardFixture {}; + +class CommandQueueMultiDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + + arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + num_devices_ = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices_ < 2 ) { + GTEST_SKIP(); + } + + std::vector chip_ids; + for (unsigned int id = 0; id < num_devices_; id++) { + chip_ids.push_back(id); + } + + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + reserved_devices_ = tt::tt_metal::detail::CreateDevices(chip_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + for (const auto &[id, device] : reserved_devices_) { + devices_.push_back(device); + } + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + std::vector devices_; + std::map 
reserved_devices_; + size_t num_devices_; +}; + +class CommandQueueMultiDeviceProgramFixture : public CommandQueueMultiDeviceFixture {}; diff --git a/tests/tt_metal/tt_metal/common/device_fixture.hpp b/tests/tt_metal/tt_metal/common/device_fixture.hpp new file mode 100644 index 00000000000..c0b086f07c8 --- /dev/null +++ b/tests/tt_metal/tt_metal/common/device_fixture.hpp @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "dispatch_fixture.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/device/device_pool.hpp" + +class DeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + // Some CI machines have lots of cards, running all tests on all cards is slow + // Coverage for multidevices is decent if we just confirm 2 work + this->num_devices_ = tt::tt_metal::GetNumAvailableDevices(); + if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 2) { + this->num_devices_ = 2; + } + + std::vector ids; + for (unsigned int id = 0; id < num_devices_; id++) { + ids.push_back(id); + } + this->create_devices(ids); + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices(const std::vector& device_ids) { + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + tt::DevicePool::initialize(device_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + this->devices_ = 
tt::DevicePool::instance().get_all_active_devices(); + this->num_devices_ = this->devices_.size(); + } + + size_t num_devices_; +}; + +class DeviceSingleCardFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(); + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + void validate_dispatch_mode() { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices() { + const chip_id_t mmio_device_id = 0; + this->reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); + this->device_ = this->reserved_devices_.at(mmio_device_id); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + this->num_devices_ = this->reserved_devices_.size(); + } + + tt::tt_metal::Device *device_; + std::map reserved_devices_; + size_t num_devices_; +}; + +class DeviceSingleCardBufferFixture : public DeviceSingleCardFixture {}; + +class BlackholeSingleCardFixture : public DeviceSingleCardFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + if (this->arch_ != tt::ARCH::BLACKHOLE) { + GTEST_SKIP(); + } + this->create_devices(); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp similarity index 88% rename from tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp rename to tests/tt_metal/tt_metal/common/dispatch_fixture.hpp index 1b1b4d6104f..546311661f6 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 @@ -14,16 +14,24 @@ #include "tt_metal/impl/device/device_pool.hpp" // A dispatch-agnostic test fixture -class CommonFixture: public ::testing::Test { -public: +class DispatchFixture : public ::testing::Test { + public: // A function to run a program, according to which dispatch mode is set. - void RunProgram(tt::tt_metal::Device* device, tt::tt_metal::Program& program) { + void RunProgram(tt::tt_metal::Device* device, tt::tt_metal::Program& program, const bool skip_finish = false) { const uint64_t program_id = program.get_id(); if (this->slow_dispatch_) { tt::tt_metal::detail::LaunchProgram(device, program); } else { tt::tt_metal::CommandQueue& cq = device->command_queue(); tt::tt_metal::EnqueueProgram(cq, program, false); + if (!skip_finish) { + tt::tt_metal::Finish(cq); + } + } + } + void FinishCommands(tt::tt_metal::Device* device) { + if (!this->IsSlowDispatch()) { + tt::tt_metal::CommandQueue& cq = device->command_queue(); tt::tt_metal::Finish(cq); } } @@ -51,26 +59,12 @@ class CommonFixture: public ::testing::Test { tt::ARCH arch_; std::vector devices_; bool slow_dispatch_; - bool has_remote_devices_; void SetUp() override { - // Skip for slow dispatch for now - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "Running test using Slow Dispatch"); - slow_dispatch_ = true; - } else { - tt::log_info(tt::LogTest, "Running test using Fast Dispatch"); - slow_dispatch_ = false; - } - + this->DetectDispatchMode(); // Set up all available devices this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - auto num_pci_devices = 
tt::tt_metal::GetNumPCIeDevices(); - // An extra flag for if we have remote devices, as some tests are disabled for fast - // dispatch + remote devices. - this->has_remote_devices_ = num_devices > num_pci_devices; std::vector ids; for (unsigned int id = 0; id < num_devices; id++) { if (SkipTest(id)) @@ -118,4 +112,15 @@ class CommonFixture: public ::testing::Test { run_function(); log_info(tt::LogTest, "Finished running test on device {}.", device->id()); } + + void DetectDispatchMode() { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "Running test using Slow Dispatch"); + this->slow_dispatch_ = true; + } else { + tt::log_info(tt::LogTest, "Running test using Fast Dispatch"); + this->slow_dispatch_ = false; + } + } }; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp rename to tests/tt_metal/tt_metal/common/matmul_test_utils.hpp index 4eecec45c61..f1cc9436ccb 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp +++ b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp @@ -4,14 +4,9 @@ #pragma once -#include -#include -#include - -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" -#include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tt_metal/common/test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp new file mode 100644 index 00000000000..7f0b4c7a17d --- /dev/null +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "host_api.hpp" +#include "dispatch_fixture.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/device/device_pool.hpp" + +class MultiDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } +}; + +class N300DeviceFixture : public MultiDeviceFixture { + protected: + void SetUp() override { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + + MultiDeviceFixture::SetUp(); + + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); + if (this->arch_ == tt::ARCH::WORMHOLE_B0 && num_devices == 2 && num_pci_devices == 1) { + std::vector ids; + for (chip_id_t id = 0; id < num_devices; id++) { + ids.push_back(id); + } + + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + } else { + GTEST_SKIP(); + } + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt similarity index 50% rename from tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt rename to tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt index 9f4257918f4..9a445e323d4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt @@ -1,14 +1,4 @@ -set(UNIT_TESTS_COMMON_SRC - 
${CMAKE_CURRENT_SOURCE_DIR}/basic/test_device_init.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common/test_bit_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common/test_dispatch.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_flatten.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_large_block.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_X_dram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_single_core.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_X_tile.cpp +set(UNIT_TESTS_DEBUG_TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_eth_cores.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_invalid_print_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_mute_device.cpp @@ -17,36 +7,35 @@ set(UNIT_TESTS_COMMON_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_before_finish.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_hanging.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tensix_dest.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tiles.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/test_dram_to_l1_multicast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/test_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_assert.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize_delays.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_pause.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_ringbuf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_waypoint.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp -) -add_library(unit_tests_common_o STATIC 
${UNIT_TESTS_COMMON_SRC}) -TT_ENABLE_UNITY_BUILD(unit_tests_common_o) -target_link_libraries( - unit_tests_common_o - PUBLIC - gtest - gtest_main - magic_enum - fmt::fmt-header-only - span ) + +add_executable(unit_tests_debug_tools ${UNIT_TESTS_DEBUG_TOOLS_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_debug_tools) + +target_link_libraries(unit_tests_debug_tools PUBLIC test_metal_common_libs) target_include_directories( - unit_tests_common_o - PUBLIC - $ + unit_tests_debug_tools + PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/tt_metal ${PROJECT_SOURCE_DIR}/tt_metal/common ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common ) +set_target_properties( + unit_tests_debug_tools + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp similarity index 53% rename from tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp rename to tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp index 9d74f94942d..f8189d9c98e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp @@ -4,14 +4,110 @@ #pragma once -#include -#include -#include "common_fixture.hpp" -#include "impl/debug/watcher_server.hpp" -#include "llrt/rtoptions.hpp" - -// A version of CommonFixture with watcher enabled -class WatcherFixture: public CommonFixture { +#include +#include "debug/watcher_server.hpp" +#include "dispatch_fixture.hpp" +#include "tt_metal/tt_metal/common/dispatch_fixture.hpp" + +class DebugToolsFixture : public DispatchFixture { + protected: + bool watcher_previous_enabled; + + void TearDown() override { + DispatchFixture::TearDown(); + tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); + } + + template + 
void RunTestOnDevice(const std::function& run_function, Device* device) { + auto run_function_no_args = [=]() { run_function(static_cast(this), device); }; + DispatchFixture::RunTestOnDevice(run_function_no_args, device); + } +}; + +// A version of DispatchFixture with DPrint enabled on all cores. +class DPrintFixture : public DebugToolsFixture { +public: + inline static const string dprint_file_name = "gtest_dprint_log.txt"; + + // A function to run a program, according to which dispatch mode is set. + void RunProgram(Device* device, Program& program) { + // Only difference is that we need to wait for the print server to catch + // up after running a test. + DebugToolsFixture::RunProgram(device, program); + tt::DprintServerAwait(); + } + +protected: + // Running with dprint + watcher enabled can make the code size blow up, so let's force watcher + // disabled for DPRINT tests. + void SetUp() override { + // The core range (physical) needs to be set >= the set of all cores + // used by all tests using this fixture, so set dprint enabled for + // all cores and all devices + tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassWorker); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassWorker); + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); + // Send output to a file so the test can check after program is run. 
+ tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); + tt::llrt::OptionsG.set_test_mode_enabled(true); + watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); + tt::llrt::OptionsG.set_watcher_enabled(false); + + ExtraSetUp(); + + // Parent class initializes devices and any necessary flags + DebugToolsFixture::SetUp(); + } + + void TearDown() override { + // Parent class tears down devices + DebugToolsFixture::TearDown(); + + // Remove the DPrint output file after the test is finished. + std::remove(dprint_file_name.c_str()); + + // Reset DPrint settings + tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); + tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassNoneSpecified); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassNoneSpecified); + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, ""); + tt::llrt::OptionsG.set_test_mode_enabled(false); + } + + void RunTestOnDevice( + const std::function& run_function, + Device* device + ) { + DebugToolsFixture::RunTestOnDevice(run_function, device); + tt::DPrintServerClearLogFile(); + tt::DPrintServerClearSignals(); + } + + // Override this function in child classes for additional setup commands between DPRINT setup + // and device creation. + virtual void ExtraSetUp() {} +}; + +// For usage by tests that need the dprint server devices disabled. 
+class DPrintDisableDevicesFixture : public DPrintFixture { +protected: + void ExtraSetUp() override { + // For this test, mute each devices using the environment variable + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); + } +}; + +// A version of DispatchFixture with watcher enabled +class WatcherFixture : public DebugToolsFixture { public: inline static const string log_file_name = "generated/watcher/watcher.log"; inline static const int interval_ms = 250; @@ -20,7 +116,7 @@ class WatcherFixture: public CommonFixture { void RunProgram(Device* device, Program& program, bool wait_for_dump = false) { // Only difference is that we need to wait for the print server to catch // up after running a test. - CommonFixture::RunProgram(device, program); + DebugToolsFixture::RunProgram(device, program); // Wait for watcher to run a full dump before finishing, need to wait for dump count to // increase because we'll likely check in the middle of a dump. 
@@ -31,7 +127,6 @@ class WatcherFixture: public CommonFixture { } protected: - bool watcher_previous_enabled; int watcher_previous_interval; bool watcher_previous_dump_all; bool watcher_previous_append; @@ -57,15 +152,14 @@ class WatcherFixture: public CommonFixture { tt::watcher_clear_log(); // Parent class initializes devices and any necessary flags - CommonFixture::SetUp(); + DebugToolsFixture::SetUp(); } void TearDown() override { // Parent class tears down devices - CommonFixture::TearDown(); + DebugToolsFixture::TearDown(); // Reset watcher settings to their previous values - tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); tt::llrt::OptionsG.set_watcher_interval(watcher_previous_interval); tt::llrt::OptionsG.set_watcher_dump_all(watcher_previous_dump_all); tt::llrt::OptionsG.set_watcher_append(watcher_previous_append); @@ -79,10 +173,7 @@ class WatcherFixture: public CommonFixture { const std::function& run_function, Device* device ) { - auto run_function_no_args = [=]() { - run_function(this, device); - }; - CommonFixture::RunTestOnDevice(run_function_no_args, device); + DebugToolsFixture::RunTestOnDevice(run_function, device); // Wait for a final watcher poll and then clear the log. 
std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); tt::watcher_clear_log(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp similarity index 75% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp rename to tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp index e7074237636..64359886fd8 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp @@ -3,61 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include -#include -#include "impl/kernels/kernel.hpp" -inline std::pair, std::vector> create_runtime_args( - const uint32_t num_unique_rt_args, - const uint32_t num_common_rt_args, - const uint32_t unique_base, - const uint32_t common_base) { - TT_FATAL( - num_unique_rt_args + num_common_rt_args <= tt::tt_metal::max_runtime_args, - "Number of unique runtime args and common runtime args exceeds the maximum limit of {} runtime args", - tt::tt_metal::max_runtime_args); - - std::vector common_rt_args; - for (uint32_t i = 0; i < num_common_rt_args; i++) { - common_rt_args.push_back(common_base + i); - } - - std::vector unique_rt_args; - for (uint32_t i = 0; i < num_unique_rt_args; i++) { - unique_rt_args.push_back(unique_base + i); - } - - return std::make_pair(unique_rt_args, common_rt_args); -} - -// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. -// Optionally force the max size for one of the vectors. -inline std::pair, std::vector> create_runtime_args( - const bool force_max_size = false, const uint32_t unique_base = 0, const uint32_t common_base = 100) { - uint32_t num_rt_args_unique = rand() % (tt::tt_metal::max_runtime_args + 1); - uint32_t num_rt_args_common = - num_rt_args_unique < tt::tt_metal::max_runtime_args ? 
rand() % (tt::tt_metal::max_runtime_args - num_rt_args_unique + 1) : 0; - - if (force_max_size) { - if (rand() % 2) { - num_rt_args_unique = tt::tt_metal::max_runtime_args; - num_rt_args_common = 0; - } else { - num_rt_args_common = tt::tt_metal::max_runtime_args; - num_rt_args_unique = 0; - } - } - - log_trace( - tt::LogTest, - "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", - __FUNCTION__, - num_rt_args_unique, - num_rt_args_common, - force_max_size); - - return create_runtime_args(num_rt_args_unique, num_rt_args_common, unique_base, common_base); -} +#include "host_api.hpp" // Helper function to open a file as an fstream, and check that it was opened properly. inline bool OpenFile(string &file_name, std::fstream &file_stream, std::ios_base::openmode mode) { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp index 38ece0f5ca0..e97d6eef743 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -81,7 +81,7 @@ static void RunTest(DPrintFixture* fixture, Device* device, bool active) { } } -TEST_F(DPrintFixture, TestPrintEthCores) { +TEST_F(DPrintFixture, ActiveEthTestPrint) { for (Device* device : this->devices_) { // Skip if no ethernet cores on this device if (device->get_active_ethernet_cores(true).size() == 0) { @@ -96,7 +96,7 @@ TEST_F(DPrintFixture, TestPrintEthCores) { ); } } 
-TEST_F(DPrintFixture, TestPrintIEthCores) { +TEST_F(DPrintFixture, IdleEthTestPrint) { if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp similarity index 65% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp index 11b89c90dfe..47ba5193765 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp @@ -2,9 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" +#include "debug_tools_fixture.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/llrt/rtoptions.hpp" ////////////////////////////////////////////////////////////////////////////////////////// @@ -12,22 +11,17 @@ ////////////////////////////////////////////////////////////////////////////////////////// using namespace tt::tt_metal; -TEST(DPrintErrorChecking, TestPrintInvalidCore) { +TEST_F(DPrintFixture, TensixTestPrintInvalidCore) { // Set DPRINT enabled on a mix of invalid and valid cores. Previously this would hang during // device setup, but not the print server should simply ignore the invalid cores. 
std::map> dprint_cores; dprint_cores[CoreType::WORKER] = {{0, 0}, {1, 1}, {100, 100}}; tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, dprint_cores); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); - - const int device_id = 0; - Device* device = nullptr; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - device = tt::tt_metal::CreateDevice(device_id, tt::llrt::OptionsG.get_num_hw_cqs(), DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); // We expect that even though illegal worker cores were requested, device setup did not hang. // So just make sure that device setup worked and then close the device. - EXPECT_TRUE(device != nullptr); + for (Device* device : this->devices_) { + EXPECT_TRUE(device != nullptr); + } tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::tt_metal::CloseDevice(device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp index 8440e242ac6..d359a40e7c9 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -77,7 +77,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixtureDisableDevices, TestPrintMuteDevice) { +TEST_F(DPrintDisableDevicesFixture, TensixTestPrintMuteDevice) { for (Device* device : 
this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp index 3798288e27c..8d158464401 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp @@ -1,10 +1,10 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" #include "impl/debug/dprint_server.hpp" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -66,7 +66,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintMuting) { +TEST_F(DPrintFixture, TensixTestPrintMuting) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp index 42d7382b5bb..2cdadb41f35 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" #include 
"gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -192,7 +192,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintFromAllHarts) { +TEST_F(DPrintFixture, TensixTestPrintFromAllHarts) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp index 0370b51f3f2..9e7a775b9c5 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking that the finish command can wait for the last dprint. @@ -58,7 +58,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ); } -TEST_F(DPrintFixture, TestPrintFinish) { +TEST_F(DPrintFixture, TensixTestPrintFinish) { auto devices = this->devices_; // Run only on the first device, as this tests disconnects devices and this can cause // issues on multi-device setups. 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp index a707ffff86c..00a0252e7a8 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -56,7 +56,7 @@ try { } } -TEST_F(DPrintFixture, TestPrintHanging) { +TEST_F(DPrintFixture, TensixTestPrintHanging) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's // tricky to check print server status from the finish loop for slow dispatch. Once issue #4363 // is resolved, we should add a check for print server handing in slow dispatch as well. 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp index 1f73a7bc736..7d8ec61dd30 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #include "common/bfloat16.hpp" -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" #include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking dprint @@ -230,7 +228,7 @@ static bool reader_datacopy_writer( return input_data == output_data; } -TEST_F(DPrintFixture, TestDestPrintFloat16b) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat16b) { // Setup test configuration DestPrintTestConfig test_config = { .num_tiles = 2, @@ -246,7 +244,7 @@ TEST_F(DPrintFixture, TestDestPrintFloat16b) { this->devices_[0]); } -TEST_F(DPrintFixture, TestDestPrintFloat32) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat32) { // Setup test configuration DestPrintTestConfig test_config = { .num_tiles = 2, @@ -266,7 +264,7 @@ TEST_F(DPrintFixture, TestDestPrintFloat32) { this->devices_[0]); } -TEST_F(DPrintFixture, TestDestPrintFloat32RemapAndSwizzle) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat32RemapAndSwizzle) { // Setup test configuration DestPrintTestConfig test_config = { 
.num_tiles = 3, diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp index 6a0210a9ef6..a4af2d278e0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "common/bfloat8.hpp" #include "common/bfloat4.hpp" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp index 0786c960813..05ae9069dec 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -279,7 +279,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintRaiseWait) { +TEST_F(DPrintFixture, TensixTestPrintRaiseWait) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp 
similarity index 91% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp index 8f5ca5efc46..25fd8be5c26 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher asserts. @@ -11,6 +11,7 @@ using namespace tt; using namespace tt::tt_metal; +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_type) { // Set up program Program program = Program(); @@ -176,8 +177,10 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty log_info(LogTest, "Reported error: {}", exception); EXPECT_TRUE(expected == get_watcher_exception_message()); } +} -TEST_F(WatcherFixture, TestWatcherAssertBrisc) { +TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -188,7 +191,8 @@ TEST_F(WatcherFixture, TestWatcherAssertBrisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -197,7 +201,8 @@ TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -206,7 +211,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { ); } -TEST_F(WatcherFixture, 
TestWatcherAssertTrisc1) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -215,7 +221,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc1) { ); } -TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -224,7 +231,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { ); } -TEST_F(WatcherFixture, TestWatcherAssertErisc) { +TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -233,7 +241,8 @@ TEST_F(WatcherFixture, TestWatcherAssertErisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertIErisc) { +TEST_F(WatcherFixture, IdleEthTestWatcherAssertIErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp index dd23509745b..043bc8682fa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher polling the eth link training counter. 
@@ -15,7 +15,7 @@ using namespace tt::tt_metal; static void RunTest(WatcherFixture* fixture, Device* device) { } -TEST_F(WatcherFixture, TestWatcherEthLinkCheck) { +TEST_F(WatcherFixture, ActiveEthTestWatcherEthLinkCheck) { // Eth link retraining only supported on WH for now, this test is also dispatch-agnostic so just pick one. if (this->slow_dispatch_ || this->arch_ != tt::ARCH::WORMHOLE_B0 || this->devices_.size() == 1) { log_info(LogTest, "Test only runs on fast dispatch + multi-chip WH, skipping..."); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 416ffece9bd..8f656da7fd6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "llrt/llrt.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -225,7 +225,7 @@ void CheckHostSanitization(Device *device) { } } -TEST_F(WatcherFixture, TestWatcherSanitize) { +TEST_F(WatcherFixture, TensixTestWatcherSanitize) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's // tricky to check watcher server status from the finish loop for slow dispatch. Once issue #4363 // is resolved, we should add a check for print server handing in slow dispatch as well. 
@@ -244,7 +244,7 @@ TEST_F(WatcherFixture, TestWatcherSanitize) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentL1) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -256,7 +256,7 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentL1) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAM) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAM) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -268,7 +268,7 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAM) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAMNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAMNCrisc) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -280,13 +280,13 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAMNCrisc) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeEth) { +TEST_F(WatcherFixture, ActiveEthTestWatcherSanitizeEth) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice(RunTestEth, this->devices_[0]); } -TEST_F(WatcherFixture, TestWatcherSanitizeIEth) { +TEST_F(WatcherFixture, IdleEthTestWatcherSanitizeIEth) { if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp index 600872d58ac..52bfc59a354 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "llrt/rtoptions.hpp" -#include "watcher_fixture.hpp" 
-#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "llrt/llrt.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -154,7 +154,7 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord EXPECT_TRUE((read_vec[0] >> 24) == 0x3); } -TEST_F(WatcherDelayFixture, TestWatcherSanitizeInsertDelays) { +TEST_F(WatcherDelayFixture, TensixTestWatcherSanitizeInsertDelays) { if (this->slow_dispatch_) GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp index f358a30ebad..fb70bc91700 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher pause feature. 
@@ -134,7 +134,7 @@ static void RunTest(WatcherFixture* fixture, Device* device) { } } -TEST_F(WatcherFixture, TestWatcherPause) { +TEST_F(WatcherFixture, TensixTestWatcherPause) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp similarity index 88% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp index cc2727ef71d..97ed9adef75 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking debug ring buffer feature. 
@@ -20,6 +20,7 @@ std::vector expected = { "]" }; +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_type) { // Set up program Program program = Program(); @@ -141,8 +142,10 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty ) ); } +} -TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugBrisc);}, @@ -150,7 +153,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugNCrisc);}, @@ -158,7 +162,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc0);}, @@ -166,7 +171,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc1);}, @@ -174,7 +180,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { + using 
namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc2);}, @@ -182,7 +189,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { +TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugErisc);}, @@ -190,7 +198,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferIErisc) { +TEST_F(WatcherFixture, IdleEthTestWatcherRingBufferIErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp index 8da13273c27..60a1ffc1dcd 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/device/CMakeLists.txt b/tests/tt_metal/tt_metal/device/CMakeLists.txt new file mode 100644 index 00000000000..d1b29149f67 --- /dev/null +++ b/tests/tt_metal/tt_metal/device/CMakeLists.txt @@ -0,0 +1,29 @@ +set(UNIT_TESTS_DEVICE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_device_cluster_api.cpp 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test_device_init_and_teardown.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_device_pool.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_galaxy_cluster_api.cpp +) + +add_executable(unit_tests_device ${UNIT_TESTS_DEVICE_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_device) + +target_link_libraries(unit_tests_device PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_device + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_device + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp new file mode 100644 index 00000000000..bca695fa95e --- /dev/null +++ b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/impl/device/device_pool.hpp" +#include "multi_device_fixture.hpp" + +class GalaxyFixture : public MultiDeviceFixture { + protected: + void SkipTestSuiteIfNotGalaxyMotherboard() + { + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (!(this->arch_ == tt::ARCH::WORMHOLE_B0 && num_devices >= 32)) + { + GTEST_SKIP(); + } + } + + void InitializeDevices() + { + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + std::vector ids; + for (uint32_t id = 0; id < num_devices; id++) + { + ids.push_back(id); + } + this->device_ids_to_devices_ = tt::tt_metal::detail::CreateDevices(ids); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotGalaxyMotherboard(); + this->InitializeDevices(); + } + + void TearDown() override + { + tt::tt_metal::detail::CloseDevices(this->device_ids_to_devices_); + this->device_ids_to_devices_.clear(); + this->devices_.clear(); + } + + private: + std::map device_ids_to_devices_; +}; + +class TGFixture : public GalaxyFixture +{ + protected: + void SkipTestSuiteIfNotTG() + { + this->SkipTestSuiteIfNotGalaxyMotherboard(); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); + if (!(num_devices == 32 && num_pcie_devices == 4)) + { + GTEST_SKIP(); + } + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotTG(); + this->InitializeDevices(); + } +}; + +class TGGFixture : public GalaxyFixture +{ + protected: + void SkipTestSuiteIfNotTGG() + { + this->SkipTestSuiteIfNotGalaxyMotherboard(); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t 
num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); + if (!(num_devices == 64 && num_pcie_devices == 8)) + { + GTEST_SKIP(); + } + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotTGG(); + this->InitializeDevices(); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp b/tests/tt_metal/tt_metal/device/test_device.cpp similarity index 85% rename from tests/tt_metal/tt_metal/unit_tests/basic/device.cpp rename to tests/tt_metal/tt_metal/device/test_device.cpp index 4dc272cfb24..1137a2edeb3 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp +++ b/tests/tt_metal/tt_metal/device/test_device.cpp @@ -4,16 +4,9 @@ #include -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" -#include "tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp" +#include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" using namespace tt; @@ -81,46 +74,6 @@ bool dram_ping( } } // namespace unit_tests::basic::device -TEST_F(BasicFixture, SingleDeviceHarvestingPrints) { - auto arch = tt::get_arch_from_string(get_umd_arch_name()); - tt::tt_metal::Device* device; - const unsigned int device_id = 0; - device = tt::tt_metal::CreateDevice(device_id); - CoreCoord unharvested_logical_grid_size; - switch (arch) { - case tt::ARCH::GRAYSKULL: unharvested_logical_grid_size = CoreCoord(12, 10); break; - case tt::ARCH::WORMHOLE_B0: unharvested_logical_grid_size = CoreCoord(8, 10); break; - case tt::ARCH::BLACKHOLE: unharvested_logical_grid_size = CoreCoord(14, 10); break; - default: - TT_THROW("Unsupported arch {}", get_umd_arch_name()); - } - auto logical_grid_size = device->logical_grid_size(); - if (logical_grid_size == unharvested_logical_grid_size) { - 
tt::log_info("Harvesting Disabled in SW"); - } else { - tt::log_info("Harvesting Enabled in SW"); - tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); - } - - tt::log_info("Logical -- Noc Coordinates Mapping"); - tt::log_info("[Logical <-> NOC0] Coordinates"); - for (int r = 0; r < logical_grid_size.y; r++) { - string output_row = ""; - for (int c = 0; c < logical_grid_size.x; c++) { - const CoreCoord logical_coord(c, r); - const auto noc_coord = device->worker_core_from_logical_core(logical_coord); - output_row += "{L[x" + std::to_string(c); - output_row += "-y" + std::to_string(r); - output_row += "]:N[x" + std::to_string(noc_coord.x); - output_row += "-y" + std::to_string(noc_coord.y); - output_row += "]}, "; - } - tt::log_info("{}", output_row); - } - ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); -} - - TEST_F(DeviceFixture, PingAllLegalDramChannels) { for (unsigned int id = 0; id < num_devices_; id++) { { @@ -163,7 +116,7 @@ TEST_F(DeviceFixture, PingIllegalDramChannels) { } } -TEST_F(DeviceFixture, PingAllLegalL1Cores) { +TEST_F(DeviceFixture, TensixPingAllLegalL1Cores) { for (unsigned int id = 0; id < num_devices_; id++) { { size_t start_byte_address = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); @@ -198,7 +151,7 @@ TEST_F(DeviceFixture, PingAllLegalL1Cores) { } } -TEST_F(DeviceFixture, PingIllegalL1Cores) { +TEST_F(DeviceFixture, TensixPingIllegalL1Cores) { for (unsigned int id = 0; id < num_devices_; id++) { auto grid_size = devices_.at(id)->logical_grid_size(); grid_size.x++; @@ -215,7 +168,7 @@ TEST_F(DeviceFixture, PingIllegalL1Cores) { // 2. Launch a kernel to read and increment the value in each bank // 3. 
Host validates that the value from step 1 has been incremented // Purpose of this test is to ensure that L1 reader/writer APIs do not target harvested cores -TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) { +TEST_F(DeviceFixture, TensixValidateKernelDoesNotTargetHarvestedCores) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t num_l1_banks = this->devices_.at(id)->num_banks(BufferType::L1); std::vector host_input(1); @@ -280,7 +233,7 @@ TEST_F(DeviceFixture, TestDeviceToHostMemChannelAssignment) { } // Test to ensure writing from 16B aligned L1 address to 16B aligned PCIe address works -TEST_F(DeviceFixture, TestL1ToPCIeAt16BAlignedAddress) { +TEST_F(DeviceFixture, TensixTestL1ToPCIeAt16BAlignedAddress) { tt_metal::Program program = tt_metal::CreateProgram(); Device *device = this->devices_.at(0); EXPECT_TRUE(device->is_mmio_capable()); diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp rename to tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp index 312b65a63d9..b846ead88e3 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp @@ -5,15 +5,10 @@ #include #include -#include -#include -#include "n300_device_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; @@ -23,7 +18,7 @@ namespace unit_tests::multichip::cluster { // Run this on Nebula X2 only, validate etherent core apis are correct // Known connectivity: chip 0 (x=9, y=6) <--> chip 1 (x=9, y=0) // chip 0 (x=1, y=6) <--> chip 1 (x=1, 
y=0) -TEST_F(N300DeviceFixture, ValidateEthernetConnectivity) { +TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { const auto& device_0 = this->devices_.at(0); const auto& device_1 = this->devices_.at(1); @@ -79,13 +74,13 @@ TEST_F(N300DeviceFixture, ValidateEthernetConnectivity) { ASSERT_TRUE(chip_1_eth_noc_coords_returned == chip_1_eth_noc_coords_expected); } -TEST_F(N300DeviceFixture, InvalidLogicalEthernetCore) { +TEST_F(N300DeviceFixture, EthInvalidLogicalEthernetCore) { const auto& device_0 = this->devices_.at(0); EXPECT_ANY_THROW(device_0->ethernet_core_from_logical_core(CoreCoord(1, 0))); EXPECT_ANY_THROW(device_0->ethernet_core_from_logical_core(CoreCoord(0, 16))); } -TEST_F(N300DeviceFixture, ValidateAllEthernetCoreMapping) { +TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { static std::map expected_mapping_logical_to_physical = { {CoreCoord(0, 0), CoreCoord(9, 0)}, {CoreCoord(0, 1), CoreCoord(1, 0)}, @@ -112,7 +107,7 @@ TEST_F(N300DeviceFixture, ValidateAllEthernetCoreMapping) { } } -TEST_F(N300DeviceFixture, ValidatePhysicalCoreConversion) { +TEST_F(N300DeviceFixture, EthValidatePhysicalCoreConversion) { static std::map expected_mapping_logical_to_physical = { {CoreCoord(0, 0), CoreCoord(9, 0)}, {CoreCoord(0, 1), CoreCoord(1, 0)}, @@ -141,7 +136,7 @@ TEST_F(N300DeviceFixture, ValidatePhysicalCoreConversion) { EXPECT_ANY_THROW(device_0->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); } -TEST_F(N300DeviceFixture, ValidateEthernetSockets) { +TEST_F(N300DeviceFixture, ActiveEthValidateEthernetSockets) { const auto& device_0 = this->devices_.at(0); const auto& device_1 = this->devices_.at(1); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp rename to tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp 
index f4dfae4d653..44974e34b10 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp @@ -4,16 +4,10 @@ #include -#include -#include -#include - #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/device/device_pool.hpp" @@ -61,7 +55,6 @@ bool load_all_blank_kernels(tt_metal::Device *device) { CreateKernel(program, "tt_metal/kernels/compute/blank.cpp", all_cores, ComputeConfig{}); unit_tests_common::basic::test_device_init::launch_program(device, program); - // tt_metal::detail::LaunchProgram(device, program); return pass; } } // namespace unit_tests_common::basic::test_device_init @@ -92,7 +85,7 @@ TEST_P(DeviceParamFixture, DeviceInitializeAndTeardown) { } } -TEST_P(DeviceParamFixture, DeviceLoadBlankKernels) { +TEST_P(DeviceParamFixture, TensixDeviceLoadBlankKernels) { unsigned int num_devices = GetParam(); unsigned int num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); if ((arch == tt::ARCH::GRAYSKULL && num_devices > 1) || (num_devices > num_pci_devices)) { diff --git a/tests/tt_metal/tt_metal/device/test_device_pool.cpp b/tests/tt_metal/tt_metal/device/test_device_pool.cpp new file mode 100644 index 00000000000..b1b5cc94822 --- /dev/null +++ b/tests/tt_metal/tt_metal/device/test_device_pool.cpp @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "host_api.hpp" +#include "impl/device/device_pool.hpp" + +using namespace tt; + +TEST(DevicePool, DevicePoolOpenClose) { + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get devices again + for (const auto& dev : devices) { + dev->close(); + } + devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolReconfigDevices) { + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get devices with different configs + for (const auto& dev : devices) { + dev->close(); + } + l1_small_size = 2048; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + devices = 
DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolAddDevices) { + if (tt_metal::GetNumAvailableDevices() != 8) { + GTEST_SKIP(); + } + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get more devices + for (const auto& dev : devices) { + dev->close(); + } + device_ids = {0, 1, 2, 3}; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + devices = DevicePool::instance().get_all_active_devices(); + ASSERT_TRUE(devices.size() >= 4); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolReduceDevices) { + if (tt_metal::GetNumAvailableDevices() != 8) { + GTEST_SKIP(); + } + std::vector device_ids{0, 1, 2, 3}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + const auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + 
ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get less devices + for (const auto& dev : devices) { + dev->close(); + } + device_ids = {0}; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto dev = DevicePool::instance().get_active_device(0); + ASSERT_TRUE(dev->id() == 0); + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + DevicePool::instance().close_device(0); +} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp rename to tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 447758371e8..27a551cce0f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include - -#include "device_fixture.hpp" +#include "galaxy_fixture.hpp" #include "tt_metal/llrt/tt_cluster.hpp" #include "tt_metal/host_api.hpp" @@ -51,7 +47,7 @@ std::unordered_set get_ethernet_connected_device_ids(const chip_id_t // shelves and 4 links between adjacent Galaxy chips that are on the same // shelf, and currently tt::Cluster does not expose a way of determining // which shelf a particular Galaxy chip is on. 
-TEST_F(TGFixture, ValidateNumLinksBetweenAdjacentGalaxyChips) { +TEST_F(TGFixture, ActiveEthValidateNumLinksBetweenAdjacentGalaxyChips) { for (Device* device : this->devices_) { const chip_id_t device_id = device->id(); @@ -85,7 +81,7 @@ TEST_F(TGFixture, ValidateNumLinksBetweenAdjacentGalaxyChips) { // Validate that each MMIO chip links to two separate Galaxy chips, // and that each Galaxy chip links to at most one MMIO chip -TEST_F(GalaxyFixture, ValidateLinksBetweenMMIOAndGalaxyChips) { +TEST_F(GalaxyFixture, ActiveEthValidateLinksBetweenMMIOAndGalaxyChips) { for (Device* device : this->devices_) { const chip_id_t device_id = device->id(); diff --git a/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt new file mode 100644 index 00000000000..8ab4924c4f8 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt @@ -0,0 +1,33 @@ +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_buffer) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_event) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_program) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_trace) + +add_executable( + unit_tests_dispatch + $ + $ + $ + $ +) +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch) + +target_link_libraries(unit_tests_dispatch PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_dispatch + PRIVATE + ${UMD_HOME} + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_dispatch + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt new file mode 100644 index 00000000000..710e490c74a --- /dev/null +++ 
b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_BUFFER_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_buffer_o STATIC ${UNIT_TESTS_DISPATCH_BUFFER_SRC}) + +target_link_libraries(unit_tests_dispatch_buffer_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_buffer_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_buffer $) + +target_link_libraries(unit_tests_dispatch_buffer PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_buffer + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_buffer) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp similarity index 63% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 7cab67ff6d6..a453ca0074e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -5,12 +5,11 @@ #include #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" 
+#include "multi_command_queue_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "gtest/gtest.h" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/impl/device/device.hpp" using std::vector; @@ -323,12 +322,91 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( return pass; } +bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector>& cqs, const TestBufferConfig& config) { + bool pass = true; + for (const bool use_void_star_api: {true, false}) { + + size_t buf_size = config.num_pages * config.page_size; + std::vector> buffers; + std::vector> srcs; + for (uint i = 0; i < cqs.size(); i++) { + buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); + srcs.push_back(generate_arange_vector(buffers[i]->size())); + if (use_void_star_api) { + EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); + } else { + EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); + } + } + + for (uint i = 0; i < cqs.size(); i++) { + std::vector result; + if (use_void_star_api) { + result.resize(buf_size / sizeof(uint32_t)); + EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true); + } else { + EnqueueReadBuffer(cqs[i], *buffers[i], result, true); + } + bool local_pass = (srcs[i] == result); + pass &= local_pass; + } + } + + return pass; +} + } // end namespace local_test_functions namespace basic_tests { namespace dram_tests { -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { +TEST_F(CommandQueueBufferFixture, DISABLED_TestAsyncBufferRW) { + // Test Async Enqueue Read and Write + Get Addr + Buffer Allocation and Deallocation + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + for (int j = 0; j < 10; j++) 
{ + // Asynchronously initialize a buffer on device + uint32_t first_buf_value = j + 1; + uint32_t second_buf_value = j + 2; + uint32_t first_buf_size = 4096; + uint32_t second_buf_size = 2048; + // Asynchronously allocate buffer on device + std::shared_ptr buffer = + Buffer::create(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); + std::shared_ptr allocated_buffer_address = std::make_shared(); + EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); + // Ensure returned addr is correct + EXPECT_EQ((*allocated_buffer_address), buffer->address()); + + std::shared_ptr> vec = + std::make_shared>(first_buf_size / 4, first_buf_value); + std::vector readback_vec = {}; + // Write first vector to existing on device buffer. + EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); + // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access + // to this data) + vec = std::make_shared>(second_buf_size / 4, second_buf_value); + // Simulate what tt-eager does: Share buffer ownership with program + AssignGlobalBufferToProgram(buffer, program); + // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be + // deallocated) + buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); + // Write second vector to second buffer + EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); + // Have main thread give up ownership immediately after writing + vec.reset(); + // Read both buffer and ensure data is correct + EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); + for (int i = 0; i < readback_vec.size(); i++) { + EXPECT_EQ(readback_vec[i], second_buf_value); + } + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToDramBank0) { TestBufferConfig config = {.num_pages = 1, 
.page_size = 2048, .buftype = BufferType::DRAM}; for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); @@ -336,7 +414,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllDramBanks) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllDramBanks) { for (Device *device : devices_) { TestBufferConfig config = { .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), .page_size = 2048, .buftype = BufferType::DRAM}; @@ -345,7 +423,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllDramBanks) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { constexpr uint32_t num_round_robins = 2; for (Device *device : devices_) { TestBufferConfig config = { @@ -356,7 +434,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRo } } -TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) { +TEST_F(CommandQueueSingleCardBufferFixture, Sending131072Pages) { for (Device *device : devices_) { TestBufferConfig config = {.num_pages = 131072, .page_size = 128, .buftype = BufferType::DRAM}; tt::log_info("Running On Device {}", device->id()); @@ -364,7 +442,7 @@ TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) { } } -TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanAndUnalignedToTransferPage) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanAndUnalignedToTransferPage) { constexpr uint32_t num_round_robins = 2; for (Device *device : devices_) { TestBufferConfig config = { @@ -376,7 +454,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanAndUnalignedToTransferPa } } -TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanMaxPrefetchCommandSize) { constexpr uint32_t 
num_round_robins = 1; for (Device *device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); @@ -390,7 +468,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize) } } -TEST_F(CommandQueueSingleCardFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) { constexpr uint32_t num_round_robins = 1; for (Device *device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); @@ -405,7 +483,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUnalignedPageLargerThanMaxPrefetchComm } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram) { TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; for (Device *device : devices_) { @@ -413,7 +491,7 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram) { } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram2) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram2) { // From stable diffusion read buffer TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; @@ -422,16 +500,18 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram2) { } } -TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageSizeTooLarge) { // Should throw a host error due to the page size not fitting in the consumer CB TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - EXPECT_ANY_THROW((local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer( - this->device_, this->device_->command_queue(), config))); + for (Device *device : devices_) { + 
EXPECT_ANY_THROW((local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer( + device, device->command_queue(), config))); + } } // Requires enqueue write buffer -TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { for (Device *device : this->devices_) { tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; @@ -449,7 +529,7 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { } } -TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { +TEST_F(CommandQueueSingleCardBufferFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { for (Device *device : this->devices_) { tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; @@ -464,7 +544,7 @@ TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBu } // Test that command queue wraps when buffer available space in completion region is less than a page -TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapCompletionQOnInsufficientSpace) { uint32_t large_page_size = 8192; // page size for first and third read uint32_t small_page_size = 2048; // page size for second read @@ -503,7 +583,7 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { // Test that command queue wraps when buffer read needs to be split into multiple enqueue_read_buffer commands and // available space in completion region is less than a page -TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapCompletionQOnInsufficientSpace2) { // Using default 75-25 issue and completion queue split for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); @@ -536,18 +616,211 @@ 
TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { // TODO: add test for wrapping with non aligned page sizes +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToDramBank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } + +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllDramBanks) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { + constexpr uint32_t num_round_robins = 2; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, Sending131072Pages) { + // Was a failing case where we used to accidentally program cb num pages to be total 
+ // pages instead of cb num pages. + TestBufferConfig config = { + .num_pages = 131072, + .page_size = 128, + .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForDram) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForDram2) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + // From stable diffusion read buffer + TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + uint32_t page_size = 2048; + uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); + uint32_t num_pages = command_queue_size / page_size; + + TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = 
BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToDramBank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllDramBanks) { + TestBufferConfig config = { + .num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { + constexpr uint32_t num_round_robins = 2; + TestBufferConfig config = { + .num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, Sending131072Pages) { + // Was a failing case where we used to accidentally program cb num pages to be total + // pages instead of cb num pages. 
+ TestBufferConfig config = { + .num_pages = 131072, + .page_size = 128, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForDram) { + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForDram2) { + // From stable diffusion read buffer + TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestPageSizeTooLarge) { + if (this->arch_ == tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP(); // This test hanging on wormhole b0 + } + // Should throw a host error due to the page size not fitting in the consumer CB + TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, 
TestIssueMultipleReadWriteCommandsForOneBuffer) { + uint32_t page_size = 2048; + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); + uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel); + uint32_t num_pages = command_queue_size / page_size; + + TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + } // end namespace dram_tests namespace l1_tests { -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToL1Bank0) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToL1Bank0) { TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; for (Device *device : devices_) { local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1Banks) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllL1Banks) { for (Device *device : devices_) { auto compute_with_storage_grid = device->compute_with_storage_grid_size(); TestBufferConfig config = { @@ -559,7 +832,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1Banks) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { for (Device *device : devices_) { auto compute_with_storage_grid = device->compute_with_storage_grid_size(); TestBufferConfig config = { @@ -571,7 +844,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForL1) { 
+TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForL1) { TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; for (Device *device : devices_) { @@ -582,7 +855,7 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForL1) { } } -TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestBackToBackNon32BAlignedPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { @@ -606,7 +879,7 @@ TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { } // This case was failing for FD v1.3 design -TEST_F(CommandQueueSingleCardFixture, TestLargeBuffer4096BPageSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestLargeBuffer4096BPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { @@ -616,9 +889,107 @@ TEST_F(CommandQueueSingleCardFixture, TestLargeBuffer4096BPageSize) { } } +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToL1Bank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllL1Banks) { + auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { + auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForL1) { + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToL1Bank0) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllL1Banks) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + auto compute_with_storage_grid = device->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 
uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + auto compute_with_storage_grid = device->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForL1) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + } // end namespace l1_tests -TEST_F(CommandQueueSingleCardFixture, TestNonblockingReads) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNonblockingReads) { constexpr BufferType buff_type = BufferType::L1; for (auto device : devices_) { @@ -648,7 +1019,7 @@ namespace stress_tests { // TODO: Add stress test that vary page size -TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocking) { 
+TEST_F(CommandQueueSingleCardBufferFixture, WritesToRandomBufferTypeAndThenReadsBlocking) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; @@ -659,7 +1030,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki } } -TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblocking) { +TEST_F(CommandQueueSingleCardBufferFixture, WritesToRandomBufferTypeAndThenReadsNonblocking) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; @@ -672,7 +1043,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo } // TODO: Split this into separate tests -TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferL1ReadWrites) { std::map>> test_params; for (Device *device : devices_) { @@ -726,7 +1097,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferDRAMReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferDRAMReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -784,7 +1155,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferDRAMReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferLargeL1ReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -826,7 +1197,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeDRAMReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferLargeDRAMReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -880,7 +1251,7 @@ 
TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeDRAMReadWrites) { } } -TEST_F(CommandQueueFixture, StressWrapTest) { +TEST_F(CommandQueueSingleCardBufferFixture, StressWrapTest) { const char *arch = getenv("ARCH_NAME"); if (strcasecmp(arch, "wormhole_b0") == 0) { tt::log_info("cannot run this test on WH B0"); @@ -890,8 +1261,10 @@ TEST_F(CommandQueueFixture, StressWrapTest) { BufferStressTestConfig config = { .page_size = 4096, .max_num_pages_per_buffer = 2000, .num_iterations = 10000, .num_unique_vectors = 20}; - EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( - this->device_, this->device_->command_queue(), config)); + for (Device *device : devices_) { + EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( + device, device->command_queue(), config)); + } } } // end namespace stress_tests diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp new file mode 100644 index 00000000000..59a45d5e490 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "tt_metal/test_utils/stimulus.hpp" +#include "command_queue_fixture.hpp" + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { + uint32_t local_l1_size = 3200; + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); + CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4}); + + auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); + auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true); + + ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); + uint32_t page_size_1 = 32; + ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; + auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); + + ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1}); + uint32_t page_size_2 = 64; + ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2}; + auto input_2 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_2.size / 
sizeof(uint32_t)); + + uint32_t page_size_3 = 1024; + InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED}; + auto input_3 = tt::test_utils::generate_uniform_random_vector(0, 100, interleaved_config.size / sizeof(uint32_t)); + + for (Device *device : devices_) { + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); + DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); + DeviceAddr max_addr = l1_unreserved_base + local_l1_size; + + shard_config_1.device = device; + shard_config_2.device = device; + interleaved_config.device = device; + + std::vector physical_cores_1; + physical_cores_1.reserve(sharded_cores_1_vec.size()); + for (const auto& core : sharded_cores_1_vec) { + physical_cores_1.push_back(device->worker_core_from_logical_core(core)); + } + + std::vector physical_cores_2; + physical_cores_2.reserve(sharded_cores_2_vec.size()); + for (const auto& core : sharded_cores_2_vec) { + physical_cores_2.push_back(device->worker_core_from_logical_core(core)); + } + + device->load_sub_device_manager(sub_device_manager_1); + + auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0}); + EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size()); + EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false); + std::vector output_1; + EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true); + EXPECT_EQ(input_1, output_1); + auto input_1_it = input_1.begin(); + for (const auto& physical_core : physical_cores_1) { + auto readback = tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_1->address(), page_size_1); + EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); + input_1_it += page_size_1 / sizeof(uint32_t); + } 
+ + auto buffer_2 = CreateBuffer(interleaved_config); + EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception); + EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception); + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); + DeallocateBuffer(*buffer_1); + device->clear_loaded_sub_device_manager(); + device->load_sub_device_manager(sub_device_manager_2); + + auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1}); + EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size()); + EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false); + std::vector output_2; + EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true); + EXPECT_EQ(input_2, output_2); + auto input_2_it = input_2.begin(); + for (const auto& physical_core : physical_cores_2) { + auto readback = tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_3->address(), page_size_2); + EXPECT_TRUE(std::equal(input_2_it, input_2_it + page_size_2 / sizeof(uint32_t), readback.begin())); + input_2_it += page_size_2 / sizeof(uint32_t); + } + + auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0}); + EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size()); + EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt new file mode 100644 index 00000000000..0db070a4ba0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_EVENT_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWaitForEvent.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_events.cpp +) + +add_library(unit_tests_dispatch_event_o STATIC ${UNIT_TESTS_DISPATCH_EVENT_SRC}) + +target_link_libraries(unit_tests_dispatch_event_o PRIVATE test_metal_common_libs) + 
+target_include_directories( + unit_tests_dispatch_event_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_event $<TARGET_OBJECTS:unit_tests_dispatch_event_o>) + +target_link_libraries(unit_tests_dispatch_event PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_event + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_event) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp index 1c08c86fa15..52df4497afa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp @@ -4,12 +4,11 @@ #include -#include "command_queue_fixture.hpp" +#include "multi_command_queue_fixture.hpp" #include "tt_metal/common/logger.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "command_queue_test_utils.hpp" +#include "dispatch_test_utils.hpp" #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/device/device.hpp" @@ -23,14 +22,12 @@ void FinishAllCqs(vector>& cqs) { Finish(cqs[i]); } } - - } namespace basic_tests { // Simplest test to record Event per CQ and wait from host, and verify 
populated Event struct is correct (many events, wrap issue queue) -TEST_F(MultiCommandQueueMultiDeviceFixture, TestEventsEventSynchronizeSanity) { +TEST_F(MultiCommandQueueMultiDeviceEventFixture, TestEventsEventSynchronizeSanity) { for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); vector> cqs = {device->command_queue(0), device->command_queue(1)}; @@ -70,7 +67,7 @@ TEST_F(MultiCommandQueueMultiDeviceFixture, TestEventsEventSynchronizeSanity) { } // Simplest test to record Event per CQ and wait from host, and verify populated Event struct is correct (many events, wrap issue queue) -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEventSynchronizeSanity) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEventSynchronizeSanity) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; @@ -108,7 +105,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEventSynchronizeSanity) { } // Simplest test to record and wait-for-events on same CQ. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEnqueueWaitForEventSanity) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector events_issued_per_cq = {0, 0}; size_t num_events = 10; @@ -136,7 +133,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity // Record event on one CQ, wait-for-that-event on another CQ. Then do the flip. Occasionally insert // syncs from Host per CQ, and verify completion queues per CQ are correct. 
-TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventCrossCQs) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEnqueueWaitForEventCrossCQs) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; const size_t num_events_per_cq = 10; @@ -200,7 +197,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventCrossC // Simple 2CQ test to mix reads, writes, record-event, wait-for-event in a basic way. It's simple because // the write, record-event, wait-event, read-event are all on the same CQ, but cover both CQ's. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventSameCQ) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventSameCQ) { TestBufferConfig config = {.num_pages = 1, .page_size = 256, .buftype = BufferType::DRAM}; vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; @@ -247,7 +244,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent // More interesting test where Blocking ReadBuffer, Non-Blocking WriteBuffer are on alternate CQs, // ordered via events. Do many loops, occasionally increasing size of buffers (page size, num pages). // Ensure read back data is correct, data is different for each write. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventCrossCQs) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventCrossCQs) { if (tt::Cluster::instance().arch() == tt::ARCH::GRAYSKULL) { GTEST_SKIP() << "Skipping for GS due to readback mismatch under debug Github issue #6281 "; } @@ -307,7 +304,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent // 2 CQs with single Buffer, and a loop where each iteration has non-blocking Write to Buffer via CQ0 and non-blocking Read // to Bufffer via CQ1. 
Ping-Pongs between Writes and Reads to same buffer. Use events to synchronze read after write and // write after read before checking correct data read at the end after all cmds finished on device. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventCrossCQsPingPong) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventCrossCQsPingPong) { if (tt::Cluster::instance().arch() == tt::ARCH::GRAYSKULL) { GTEST_SKIP() << "Skipping for GS due to readback mismatch under debug Github issue #6281 "; } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp index 023462a6cd2..6f26024a085 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" #include "gtest/gtest.h" -#include "tt_metal/common/bfloat16.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "impl/debug/watcher_server.hpp" @@ -15,15 +13,15 @@ using std::vector; using namespace tt::tt_metal; +constexpr uint32_t completion_queue_event_offset = sizeof(CQDispatchCmd); +constexpr uint32_t completion_queue_page_size = dispatch_constants::TRANSFER_PAGE_SIZE; + enum class DataMovementMode: uint8_t { WRITE = 0, READ = 1 }; -constexpr uint32_t completion_queue_event_offset = sizeof(CQDispatchCmd); -constexpr uint32_t completion_queue_page_size = dispatch_constants::TRANSFER_PAGE_SIZE; - -TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrder) { +TEST_F(CommandQueueEventFixture, 
TestEventsDataMovementWrittenToCompletionQueueInOrder) { size_t num_buffers = 100; uint32_t page_size = 2048; vector page(page_size / sizeof(uint32_t)); @@ -75,7 +73,7 @@ TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrde } // Basic test, record events, check that Event struct was updated. Enough commands to trigger issue queue wrap. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { const size_t num_events = 100000; // Enough to wrap issue queue. 768MB and cmds are 22KB each, so 35k cmds. uint32_t cmds_issued_per_cq = 0; @@ -96,7 +94,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { } // Test where Host synchronously waits for event to be completed. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronize) { const size_t num_events = 100; const size_t num_events_between_sync = 10; @@ -128,7 +126,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { // Negative test. Host syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. auto future_event = std::make_shared(); @@ -155,7 +153,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { // Negative test. Device sync. Single CQ here syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. 
-TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) { +TEST_F(CommandQueueEventFixture, TestEventsQueueWaitForEventHang) { // Skip this test until #7216 is implemented. GTEST_SKIP(); tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. @@ -183,7 +181,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) { } // Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs. -TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { +TEST_F(CommandQueueEventFixture, TestEventsQueueWaitForEventBasic) { const size_t num_events = 50; const size_t num_events_between_sync = 5; @@ -214,7 +212,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { } // Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs. -TEST_F(CommandQueueFixture, TestEventsEventsQueryBasic) { +TEST_F(CommandQueueEventFixture, TestEventsEventsQueryBasic) { const size_t num_events = 50; const size_t num_events_between_query = 5; @@ -260,7 +258,7 @@ TEST_F(CommandQueueFixture, TestEventsEventsQueryBasic) { // Mix of WritesBuffers, RecordEvent, WaitForEvent, EventSynchronize with some checking. 
-TEST_F(CommandQueueFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { +TEST_F(CommandQueueEventFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { const size_t num_buffers = 2; const uint32_t page_size = 2048; vector page(page_size / sizeof(uint32_t)); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt new file mode 100644 index 00000000000..261109184f8 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt @@ -0,0 +1,36 @@ +set(UNIT_TESTS_DISPATCH_PROGRAM_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_stress.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueProgram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_program_o STATIC ${UNIT_TESTS_DISPATCH_PROGRAM_SRC}) + +target_link_libraries(unit_tests_dispatch_program_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_program_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_program $<TARGET_OBJECTS:unit_tests_dispatch_program_o>) + +target_link_libraries(unit_tests_dispatch_program PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_program + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_program) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp new file mode 100644 index 00000000000..b15f3cc8deb --- 
/dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "dispatch_fixture.hpp" + +class ProgramWithKernelCreatedFromStringFixture : public DispatchFixture { + protected: + void SetUp() override { + DispatchFixture::SetUp(); + for (Device *device : this->devices_) + { + const chip_id_t device_id = device->id(); + this->device_ids_to_devices_[device_id] = device; + } + } + + void TearDown() override { + detail::CloseDevices(this->device_ids_to_devices_); + } + + private: + std::map device_ids_to_devices_; +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 90a7b9221a3..6e3127e8471 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -4,18 +4,17 @@ #include #include -#include #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" +#include "multi_command_queue_fixture.hpp" +#include "random_program_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "gtest/gtest.h" #include "impl/buffers/buffer.hpp" #include "impl/device/device.hpp" #include "impl/kernels/kernel_types.hpp" -#include "tt_metal/common/bfloat16.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp" #include "tt_soc_descriptor.h" using std::vector; @@ -41,7 +40,6 @@ 
struct DummyProgramMultiCBConfig { uint32_t num_sems; }; - namespace local_test_functions { void initialize_dummy_kernels(Program& program, const CoreRangeSet& cr_set) { @@ -114,6 +112,41 @@ bool cb_config_successful(Device* device, Program &program, const DummyProgramMu return pass; } +bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, const CoreCoord& eth_core_coord) { + Program program; + bool pass = true; + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); + + constexpr uint32_t num_runtime_args0 = 9; + constexpr uint32_t rta_base0 = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + std::map dummy_defines0 = {{"DATA_MOVEMENT", "1"}, + {"NUM_RUNTIME_ARGS", std::to_string(num_runtime_args0)}, + {"RESULTS_ADDR", std::to_string(rta_base0)}}; + auto dummy_kernel0 = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/misc/runtime_args_kernel.cpp", + eth_core_coord, + tt::tt_metal::EthernetConfig{.noc = tt::tt_metal::NOC::NOC_0, .defines = dummy_defines0}); + + vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + tt::tt_metal::SetRuntimeArgs(program, dummy_kernel0, eth_core_coord, dummy_kernel0_args); + + tt::tt_metal::detail::CompileProgram(device, program); + auto& cq = device->command_queue(); + EnqueueProgram(cq, program, false); + Finish(cq); + + vector dummy_kernel0_args_readback = tt::llrt::read_hex_vec_from_core( + device->id(), + eth_noc_xy, + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + dummy_kernel0_args.size() * sizeof(uint32_t)); + + pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); + + return pass; +} + bool test_dummy_EnqueueProgram_with_cbs(Device* device, CommandQueue& cq, DummyProgramMultiCBConfig& program_config) { Program program; @@ -651,7 +684,7 @@ namespace basic_tests { namespace compiler_workaround_hardware_bug_tests { -TEST_F(CommandQueueSingleCardFixture, TestArbiterDoesNotHang) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestArbiterDoesNotHang) { for 
(Device *device : devices_) { Program program; @@ -669,7 +702,7 @@ TEST_F(CommandQueueSingleCardFixture, TestArbiterDoesNotHang) { } namespace single_core_tests { -TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleCbConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -682,7 +715,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlySentSingleCore) } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbSeqConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -699,7 +732,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbSeqConfigCorrectlySentSingleCor } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbRandomConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -716,7 +749,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbRandomConfigCorrectlySentSingle } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCBSharedAddressSpaceSentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -770,7 +803,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCBSharedAddressSpaceSentSingleCor } } -TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -783,7 +816,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlyUpdateSizeSentS } } -TEST_F(CommandQueueSingleCardFixture, 
TestSingleSemaphoreConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleSemaphoreConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -794,7 +827,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleSemaphoreConfigCorrectlySentSing } } -TEST_F(CommandQueueSingleCardFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { for (Device *device : devices_) { Program program; @@ -812,7 +845,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAutoInsertedBlankBriscKernelInDeviceDi } // Sanity test for setting and verifying common and unique runtime args to a single core, the simplest case. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreCompute) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanitySingleCoreCompute) { CoreRange cr0({0, 0}, {0, 0}); CoreRangeSet cr_set({cr0}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; @@ -821,8 +854,16 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreComput } } +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthEnqueueDummyProgram) { + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, eth_core)); + } + } +} + // Sanity test for setting and verifying common and unique runtime args to single cores via ERISC. Some arch may return 0 active eth cores, that's okay. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreDataMovementErisc) { +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthIncrementRuntimeArgsSanitySingleCoreDataMovementErisc) { for (Device *device : devices_) { for (const auto &eth_core : device->get_active_ethernet_cores(true)) { CoreRange cr0(eth_core); @@ -836,7 +877,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreDataMo // Sanity test for setting and verifying common and unique runtime args to single cores via ERISC(IDLE). Some arch may return 0 active eth cores, that's okay. // FIXME - Re-enable when FD-on-idle-eth is supported -TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleCoreDataMovementEriscIdle) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_ActiveEthIncrementRuntimeArgsSanitySingleCoreDataMovementEriscIdle) { for (Device *device : devices_) { for (const auto &eth_core : device->get_active_ethernet_cores(true)) { CoreRange cr0(eth_core); @@ -850,7 +891,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleC // Sanity test for setting and verifying common and unique runtime args to single cores via inactive ERISC cores. Some arch may return 0 active eth cores, that's okay. 
// FIXME - Re-enable when FD-on-idle-eth is supported -TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleCoreDataMovementEriscInactive) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_IdleEthIncrementRuntimeArgsSanitySingleCoreDataMovementEriscInactive) { for (Device *device : devices_) { for (const auto &eth_core : device->get_inactive_ethernet_cores()) { CoreRange cr0(eth_core); @@ -862,7 +903,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleC } } -TEST_F(CommandQueueSingleCardFixture, TestRuntimeArgsCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestRuntimeArgsCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -876,7 +917,7 @@ TEST_F(CommandQueueSingleCardFixture, TestRuntimeArgsCorrectlySentSingleCore) { } // end namespace single_core_tests namespace multicore_tests { -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentMultiCore) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -896,7 +937,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultiCore) { } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -916,7 +957,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, 
TensixTestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) { CBConfig cb_config_0 = {.cb_id = 0, .num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_1 = {.cb_id = 1, .num_pages = 2, .page_size = 4096, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_2 = {.cb_id = 2, .num_pages = 2, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; @@ -937,7 +978,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentMultipleCoreRanges) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -958,7 +999,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreR } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -979,7 +1020,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { CBConfig cb_config_0 = {.cb_id = 0, .num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_1 = {.cb_id = 1, .num_pages = 2, .page_size = 4096, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_2 = {.cb_id = 2, .num_pages = 2, .page_size = 2048, .data_format = 
tt::DataFormat::Float16_b}; @@ -1001,7 +1042,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM } } -TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllSemConfigsCorrectlySentMultiCore) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1014,7 +1055,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) { } } -TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllSemaphoreConfigsCorrectlySentMultipleCoreRanges) { for (Device *device : devices_) { CoreRange first_cr({0, 0}, {1, 1}); @@ -1054,7 +1095,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultip } } -TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllRuntimeArgsCorrectlySentMultiCore) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1066,7 +1107,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore) } } -TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_255_PerKernel) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllRuntimeArgsCorrectlySentMultiCore_255_PerKernel) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1078,7 +1119,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_2 } } -TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSendRuntimeArgsMultiCoreRange) { for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1092,7 
+1133,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) { } } -TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSendRuntimeArgsMultiNonOverlappingCoreRange) { // Core ranges get merged in kernel groups, this one does not for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1107,7 +1148,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCore } } -TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestUpdateRuntimeArgsMultiCoreRange) { for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1122,7 +1163,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { } // Sanity test for setting and verifying common and unique runtime args to multiple cores. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1133,7 +1174,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Max number of 255 unique RT args. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1144,7 +1185,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Max number of 255 common RT args. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1155,7 +1196,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Sanity test for setting and verifying common and unique runtime args to multiple cores via BRISC. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1166,7 +1207,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov } // Sanity test for setting and verifying common and unique runtime args to multiple cores via NCRISC. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1182,8 +1223,198 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov namespace stress_tests { +TEST_F(MultiCommandQueueSingleDeviceProgramFixture, TensixTestRandomizedProgram) { + uint32_t NUM_PROGRAMS = 100; + uint32_t MAX_LOOP = 100; + uint32_t page_size = 1024; + + if (this->arch_ == tt::ARCH::BLACKHOLE) { + GTEST_SKIP(); // Running on second CQ is hanging on CI + } + + // Make random + auto random_seed = 0; // (unsigned int)time(NULL); + uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); + log_info(tt::LogTest, "Using Test Seed: {}", seed); + srand(seed); + + CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); + CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); + CoreRangeSet cr_set({cr}); + + log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); + + vector programs; + for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { + programs.push_back(Program()); + Program& program = programs.back(); + + std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; + std::map compute_defines = {{"COMPUTE", "1"}}; + + // brisc + uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; + bool USE_MAX_RT_ARGS; + + if (i == 0) { + // Ensures that we get at least one compilation with the max amount to + // ensure it compiles and runs + BRISC_OUTER_LOOP = MAX_LOOP; + BRISC_MIDDLE_LOOP = MAX_LOOP; + BRISC_INNER_LOOP = MAX_LOOP; + NUM_CBS = NUM_CIRCULAR_BUFFERS; + NUM_SEMS = NUM_SEMAPHORES; + USE_MAX_RT_ARGS = true; + } else { + BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_INNER_LOOP = 
rand() % (MAX_LOOP) + 1; + NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; + NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; + USE_MAX_RT_ARGS = false; + } + + log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}", + i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS); + + for (uint32_t j = 0; j < NUM_CBS; j++) { + CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1)); + auto cb = CreateCircularBuffer(program, cr_set, cb_config); + } + + for (uint32_t j = 0; j < NUM_SEMS; j++) { + CreateSemaphore(program, cr_set, j + 1); + } + + auto [brisc_unique_rtargs, brisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); + uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); + vector brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size}; + + // ncrisc + uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; + if (i == 0) { + NCRISC_OUTER_LOOP = MAX_LOOP; + NCRISC_MIDDLE_LOOP = MAX_LOOP; + NCRISC_INNER_LOOP = MAX_LOOP; + } else { + NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); + uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); + vector ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size}; + + // trisc + uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, 
TRISC_INNER_LOOP; + if (i == 0) { + TRISC_OUTER_LOOP = MAX_LOOP; + TRISC_MIDDLE_LOOP = MAX_LOOP; + TRISC_INNER_LOOP = MAX_LOOP; + } else { + TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [trisc_unique_rtargs, trisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); + uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); + vector trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size}; + + bool at_least_one_kernel = false; + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", 
cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + at_least_one_kernel = true; + } + + if (not at_least_one_kernel) { + uint32_t random_risc = rand() % 3 + 1; + if (random_risc == 1) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + } else if (random_risc == 2) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + } else if (random_risc == 3) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + } else { + TT_THROW("Invalid"); + } + } + + tt::tt_metal::detail::CompileProgram(this->device_, program); + } + + for (uint8_t cq_id = 0; cq_id < 
this->device_->num_hw_cqs(); ++cq_id) { + log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id); + // This loop caches program and runs + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + + // This loops assumes already cached + uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that + + log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS); + for (uint32_t i = 0; i < NUM_ITERATIONS; i++) { + auto rng = std::default_random_engine {}; + std::shuffle(std::begin(programs), std::end(programs), rng); + if (i % 10 == 0) { + log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS); + } + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + } + + log_info(tt::LogTest, "Calling Finish."); + Finish(this->device_->command_queue(cq_id)); + } +} -TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_TensixTestFillDispatchCoreBuffer) { uint32_t NUM_ITER = 100000; for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1197,7 +1428,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) { } } -TEST_F(CommandQueueFixture, TestRandomizedProgram) { +TEST_F(CommandQueueProgramFixture, TensixTestRandomizedProgram) { uint32_t NUM_PROGRAMS = 100; uint32_t MAX_LOOP = 100; uint32_t page_size = 1024; @@ -1386,7 +1617,7 @@ TEST_F(CommandQueueFixture, TestRandomizedProgram) { Finish(this->device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestSimplePrograms) { 
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1399,7 +1630,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnEth) { +TEST_F(RandomProgramFixture, ActiveEthTestSimplePrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1416,7 +1647,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensixAndEth) { +TEST_F(RandomProgramFixture, TensixActiveEthTestSimplePrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1442,7 +1673,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensixAndEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1455,7 +1686,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnEth) { +TEST_F(RandomProgramFixture, ActiveEthTestPrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1477,7 +1708,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnTensixAndEth) { +TEST_F(RandomProgramFixture, TensixActiveEthTestPrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << 
" does not have any active ethernet cores"; } @@ -1511,7 +1742,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnTensixAndEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestAlternatingLargeAndSmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestAlternatingLargeAndSmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1532,7 +1763,7 @@ TEST_F(RandomProgramFixture, TestAlternatingLargeAndSmallProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestLargeProgramFollowedBySmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestLargeProgramFollowedBySmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1553,7 +1784,7 @@ TEST_F(RandomProgramFixture, TestLargeProgramFollowedBySmallProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestLargeProgramInBetweenFiveSmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestLargeProgramInBetweenFiveSmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp index d8e3a4fefe1..1cd8a9a0d2d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp @@ -4,13 +4,13 @@ // This file contains dispatch tests that are (generally) dispatch mode agnostic -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" using 
std::vector; // Test sync w/ semaphores betweeen eth/tensix cores // Test will hang in the kernel if the sync doesn't work properly -static void test_sems_across_core_types(CommonFixture *fixture, +static void test_sems_across_core_types(DispatchFixture *fixture, vector& devices, bool active_eth) { // just something unique... @@ -89,7 +89,7 @@ static void test_sems_across_core_types(CommonFixture *fixture, } } -TEST_F(CommonFixture, TestEthBlank) { +TEST_F(DispatchFixture, EthTestBlank) { Device *device = devices_[0]; Program program = CreateProgram(); @@ -113,7 +113,7 @@ TEST_F(CommonFixture, TestEthBlank) { } } -TEST_F(CommonFixture, TestTensixInitLocalMemory) { +TEST_F(DispatchFixture, TensixTestInitLocalMemory) { // This test will hang/assert if there is a failure @@ -136,7 +136,7 @@ TEST_F(CommonFixture, TestTensixInitLocalMemory) { this->RunProgram(device, program); } -TEST_F(CommonFixture, TestEthInitLocalMemory) { +TEST_F(DispatchFixture, EthTestInitLocalMemory) { // This test will hang/assert if there is a failure @@ -167,11 +167,11 @@ TEST_F(CommonFixture, TestEthInitLocalMemory) { } } -TEST_F(CommonFixture, TestSemaphoresTensixActiveEth) { +TEST_F(DispatchFixture, TensixActiveEthTestSemaphores) { test_sems_across_core_types(this, this->devices_, true); } -TEST_F(CommonFixture, TestSemaphoresTensixIdleEth) { +TEST_F(DispatchFixture, TensixIdleEthTestSemaphores) { if (not this->slow_dispatch_) { GTEST_SKIP(); } @@ -181,7 +181,7 @@ TEST_F(CommonFixture, TestSemaphoresTensixIdleEth) { // This test was written to cover issue #12738 (CBs for workers showing up on // active eth cores) -TEST_F(CommonFixture, TestCBsAcrossWorkerEth) { +TEST_F(DispatchFixture, TensixActiveEthTestCBsAcrossDifferentCoreTypes) { uint32_t intermediate_cb = 24; uint32_t out_cb = 16; diff --git a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp similarity index 76% 
rename from tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp index fcf79a112f0..1322f2f4331 100644 --- a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp @@ -4,7 +4,7 @@ #include -#include "core_coord.hpp" +#include "common/core_coord.hpp" #include "detail/tt_metal.hpp" #include "host_api.hpp" #include "impl/device/device.hpp" @@ -12,32 +12,9 @@ #include "impl/kernels/kernel_types.hpp" #include "impl/program/program.hpp" #include "tt_cluster_descriptor_types.h" +#include "program_with_kernel_created_from_string_fixture.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" - -using namespace tt; -using namespace tt::tt_metal; - -class ProgramWithKernelCreatedFromStringFixture : public CommonFixture { - protected: - void SetUp() override { - CommonFixture::SetUp(); - for (Device *device : this->devices_) - { - const chip_id_t device_id = device->id(); - this->device_ids_to_devices_[device_id] = device; - } - } - - void TearDown() override { - detail::CloseDevices(this->device_ids_to_devices_); - } - - private: - std::map device_ids_to_devices_; -}; - -TEST_F(ProgramWithKernelCreatedFromStringFixture, DataMovementKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixDataMovementKernel) { const CoreRange cores({0, 0}, {1, 1}); const string &kernel_src_code = R"( #include "debug/dprint.h" @@ -62,7 +39,7 @@ TEST_F(ProgramWithKernelCreatedFromStringFixture, DataMovementKernel) { }; } -TEST_F(ProgramWithKernelCreatedFromStringFixture, ComputeKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixComputeKernel) { const CoreRange cores({0, 0}, {1, 1}); const string &kernel_src_code = R"( #include "debug/dprint.h" @@ -94,7 +71,7 @@ 
TEST_F(ProgramWithKernelCreatedFromStringFixture, ComputeKernel) { }; } -TEST_F(ProgramWithKernelCreatedFromStringFixture, EthernetKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, ActiveEthEthernetKernel) { const string &kernel_src_code = R"( #include "debug/dprint.h" #include "dataflow_api.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp index 75116172d4d..f6247518800 100644 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp @@ -2,9 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "common/logger.hpp" #include "gtest/gtest.h" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/impl/device/device.hpp" using std::vector; @@ -14,34 +15,29 @@ using namespace tt::tt_metal; void RunTest(Device *device) { // Set up program Program program = Program(); + CoreRange core_range({0, 0}, {5, 5}); - std::set core_ranges; - //CoreCoord grid_size = device->logical_grid_size(); - CoreCoord grid_size = {5, 5}; - for (uint32_t y = 0; y < grid_size.y; y++) { - for (uint32_t x = 0; x < grid_size.x; x++) { - CoreCoord core(x, y); - core_ranges.insert(CoreRange(core, core)); - } - } + auto l1_unreserved_base = device->get_base_allocator_addr(tt_metal::HalMemType::L1); // Kernels on brisc + ncrisc that just add two numbers - KernelHandle brisc_kid = tt_metal::CreateKernel( + KernelHandle brisc_kid = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", - CoreRangeSet(core_ranges), + core_range, tt_metal::DataMovementConfig { .processor = DataMovementProcessor::RISCV_0, - 
.noc = NOC::RISCV_0_default + .noc = NOC::RISCV_0_default, + .compile_args = {l1_unreserved_base} } ); - KernelHandle ncrisc_kid = tt_metal::CreateKernel( + KernelHandle ncrisc_kid = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", - CoreRangeSet(core_ranges), + core_range, tt_metal::DataMovementConfig { .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default + .noc = NOC::RISCV_1_default, + .compile_args = {l1_unreserved_base + 4} } ); @@ -52,8 +48,8 @@ void RunTest(Device *device) { auto get_second_arg = [](Device *device, CoreCoord &core, uint32_t multiplier) { return (uint32_t) core.y * 100 * multiplier; }; - for (auto &core_range : core_ranges) { - CoreCoord core = core_range.start_coord; + + for (CoreCoord core : core_range) { std::vector brisc_rt_args = { get_first_arg(device, core, 1), get_second_arg(device, core, 1) @@ -78,16 +74,14 @@ void RunTest(Device *device) { } // Check results - uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); - for (auto &core_range : core_ranges) { - CoreCoord core = core_range.start_coord; + for (CoreCoord core : core_range) { std::vector brisc_result; tt_metal::detail::ReadFromDeviceL1( device, core, l1_unreserved_base, sizeof(uint32_t), brisc_result ); std::vector ncrisc_result; tt_metal::detail::ReadFromDeviceL1( - device, core, l1_unreserved_base, sizeof(uint32_t), ncrisc_result + device, core, l1_unreserved_base + 4, sizeof(uint32_t), ncrisc_result ); uint32_t expected_result = get_first_arg(device, core, 1) + get_second_arg(device, core, 1); if (expected_result != brisc_result[0]) @@ -114,13 +108,13 @@ void RunTest(Device *device) { } } -TEST(Common, AllCoresRunManyTimes) { +TEST(DispatchStress, TensixRunManyTimes) { auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); // Skip fast dispatch until it's supported for remote device. 
if (!slow_dispatch) GTEST_SKIP(); // Run 500 times to make sure that things work - for (int idx = 0; idx < 500; idx++) { + for (int idx = 0; idx < 400; idx++) { log_info(LogTest, "Running iteration #{}", idx); // Need to open/close the device each time in order to reproduce original issue. auto num_devices = tt::tt_metal::GetNumAvailableDevices(); @@ -136,11 +130,11 @@ TEST(Common, AllCoresRunManyTimes) { // Run the test on each device for (Device *device : devices_) { + log_info(LogTest, "Running on device {}", device->id()); RunTest(device); } // Close all devices tt::tt_metal::detail::CloseDevices(reserved_devices_); } - } diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp new file mode 100644 index 00000000000..f569ffd05c0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "tt_metal/test_utils/stimulus.hpp" +#include "command_queue_fixture.hpp" +#include "sub_device_test_utils.hpp" +#include "dispatch_test_utils.hpp" + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceSynchronization) { + uint32_t local_l1_size = 3200; + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); + + auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); + + ShardSpecBuffer 
shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); + uint32_t page_size_1 = 32; + ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; + auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); + + std::array sub_device_ids_to_block = {SubDeviceId{0}}; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); + + shard_config_1.device = device; + + std::vector physical_cores_1; + physical_cores_1.reserve(sharded_cores_1_vec.size()); + for (const auto& core : sharded_cores_1_vec) { + physical_cores_1.push_back(device->worker_core_from_logical_core(core)); + } + + device->load_sub_device_manager(sub_device_manager); + + auto [program, syncer_core, global_semaphore] = create_single_sync_program(device, sub_device_2); + EnqueueProgram(device->command_queue(), program, false); + + auto buffer_1 = CreateBuffer(shard_config_1, sub_device_ids_to_block[0]); + + // Test blocking synchronize doesn't stall + Synchronize(device, 0, sub_device_ids_to_block); + + // Test blocking write buffer doesn't stall + EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, true, sub_device_ids_to_block); + + // Test record event won't cause a stall + auto event = std::make_shared(); + EnqueueRecordEvent(device->command_queue(), event, sub_device_ids_to_block); + Synchronize(device, 0, sub_device_ids_to_block); + + // Test blocking read buffer doesn't stall + std::vector output_1; + EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true, sub_device_ids_to_block); + EXPECT_EQ(input_1, output_1); + auto input_1_it = input_1.begin(); + for (const auto& physical_core : physical_cores_1) { + auto readback = 
tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_1->address(), page_size_1); + EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); + input_1_it += page_size_1 / sizeof(uint32_t); + } + auto sem_addr = global_semaphore->address(); + auto physical_syncer_core = device->worker_core_from_logical_core(syncer_core); + tt::llrt::write_hex_vec_to_core(device->id(), physical_syncer_core, std::vector{1}, sem_addr); + + // Full synchronization + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceBasicPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + for (uint32_t i = 0; i < num_iters; i++) { + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardFixture, TensixActiveEthTestSubDeviceBasicEthPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = *device->get_active_ethernet_cores(true).begin(); + SubDevice 
sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); + + for (uint32_t i = 0; i < num_iters; i++) { + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + } + Synchronize(device); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp new file mode 100644 index 00000000000..b0065f0b6b0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include "host_api.hpp" +#include "impl/kernels/kernel.hpp" + +struct TestBufferConfig { + uint32_t num_pages; + uint32_t page_size; + BufferType buftype; +}; + +inline std::vector generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) { + TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error"); + std::vector src(size_bytes / sizeof(uint32_t), 0); + + for (uint32_t i = 0; i < src.size(); i++) { + src.at(i) = start + i; + } + return src; +} + +inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { + // This function just enqueues a buffer (which should be large in the config) + // write as a precursor to testing the wrap mechanism + size_t buf_size = config.num_pages * config.page_size; + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); + + std::vector src = create_random_vector_of_bfloat16( + buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + + EnqueueWriteBuffer(cq, *buffer, src, false); + return std::make_pair(std::move(buffer), src); +} + +inline bool does_device_have_active_eth_cores(const Device *device) { + return !(device->get_active_ethernet_cores(true).empty()); +} + +inline std::pair, std::vector> create_runtime_args( + const uint32_t num_unique_rt_args, + const uint32_t num_common_rt_args, + const uint32_t unique_base, + const uint32_t common_base) { + TT_FATAL( + num_unique_rt_args + num_common_rt_args <= tt::tt_metal::max_runtime_args, + "Number of unique runtime args and common runtime args exceeds the maximum limit of {} runtime args", + tt::tt_metal::max_runtime_args); + + std::vector common_rt_args; + for (uint32_t i = 0; i < num_common_rt_args; i++) { + common_rt_args.push_back(common_base + i); + } + + std::vector unique_rt_args; + for (uint32_t i = 0; i < num_unique_rt_args; i++) { + 
unique_rt_args.push_back(unique_base + i); + } + + return std::make_pair(unique_rt_args, common_rt_args); +} + +// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. +// Optionally force the max size for one of the vectors. +inline std::pair, std::vector> create_runtime_args( + const bool force_max_size = false, const uint32_t unique_base = 0, const uint32_t common_base = 100) { + uint32_t num_rt_args_unique = rand() % (tt::tt_metal::max_runtime_args + 1); + uint32_t num_rt_args_common = + num_rt_args_unique < tt::tt_metal::max_runtime_args ? rand() % (tt::tt_metal::max_runtime_args - num_rt_args_unique + 1) : 0; + + if (force_max_size) { + if (rand() % 2) { + num_rt_args_unique = tt::tt_metal::max_runtime_args; + num_rt_args_common = 0; + } else { + num_rt_args_common = tt::tt_metal::max_runtime_args; + num_rt_args_unique = 0; + } + } + + log_trace( + tt::LogTest, + "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", + __FUNCTION__, + num_rt_args_unique, + num_rt_args_common, + force_max_size); + + return create_runtime_args(num_rt_args_unique, num_rt_args_common, unique_base, common_base); +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt new file mode 100644 index 00000000000..f7092ac68e5 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_TRACE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueTrace.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_trace_o STATIC ${UNIT_TESTS_DISPATCH_TRACE_SRC}) + +target_link_libraries(unit_tests_dispatch_trace_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_trace_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + 
${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_trace $) + +target_link_libraries(unit_tests_dispatch_trace PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_trace + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_trace) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp index c5e46f1fd3b..4d5a3a6e37e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include #include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" +#include "multi_command_queue_fixture.hpp" +#include "random_program_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "detail/tt_metal.hpp" #include "tt_metal/common/env_lib.hpp" #include "gtest/gtest.h" @@ -92,8 +91,176 @@ constexpr bool kBlocking = true; constexpr bool kNonBlocking = false; vector blocking_flags = {kBlocking, kNonBlocking}; -TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { - Setup(2048); +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTrace) { + CreateDevice(2048); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + 
CommandQueue& command_queue = this->device_->command_queue(0); + CommandQueue& data_movement_queue = this->device_->command_queue(1); + + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Eager mode + vector eager_output_data; + eager_output_data.resize(input_data.size()); + + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, simple_program, true); + EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); + + // Trace mode + vector trace_output_data; + trace_output_data.resize(input_data.size()); + + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + EnqueueTrace(command_queue, tid, true); + EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); + EXPECT_TRUE(eager_output_data == trace_output_data); + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, tid); +} + +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTraceLoops) { + CreateDevice(4096); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + CommandQueue& command_queue = this->device_->command_queue(0); + CommandQueue& data_movement_queue = this->device_->command_queue(1); + + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Trace mode output + uint32_t num_loops = 10; + vector> trace_outputs; + + for (auto i = 0; i < num_loops; 
i++) { + trace_outputs.push_back({}); + trace_outputs[i].resize(input_data.size()); + } + + // Compile + EnqueueProgram(command_queue, simple_program, true); + + // Trace mode execution + uint32_t trace_id = 0; + bool trace_captured = false; + for (auto i = 0; i < num_loops; i++) { + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + + if (not trace_captured) { + trace_id = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), trace_id); + trace_captured = true; + } + + EnqueueTrace(command_queue, trace_id, false); + EnqueueReadBuffer(data_movement_queue, *output, trace_outputs[i].data(), true); + + // Expect same output across all loops + EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); + } + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, trace_id); +} + +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTraceBenchmark) { + CreateDevice(6144); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + constexpr bool kBlocking = true; + constexpr bool kNonBlocking = false; + vector blocking_flags = {kBlocking, kNonBlocking}; + + // Single Q for data and commands + // Keep this queue in passthrough mode for now + CommandQueue& command_queue = this->device_->command_queue(0); + + auto simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Trace mode output + uint32_t num_loops = 10; + vector> trace_outputs; + + for (auto i = 0; i < num_loops; i++) { + trace_outputs.push_back({}); + trace_outputs[i].resize(input_data.size()); + } + + // Eager mode + vector expected_output_data; + vector eager_output_data; + 
expected_output_data.resize(input_data.size()); + eager_output_data.resize(input_data.size()); + + // Warm up and use the eager blocking run as the expected output + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); + EnqueueProgram(command_queue, simple_program, kBlocking); + EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); + Finish(command_queue); + + for (bool blocking : blocking_flags) { + std::string mode = blocking ? "Eager-B" : "Eager-NB"; + for (auto i = 0; i < num_loops; i++) { + tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); + EnqueueProgram(command_queue, simple_program, blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); + } + if (not blocking) { + // (Optional) wait for the last non-blocking command to finish + Finish(command_queue); + } + EXPECT_TRUE(eager_output_data == expected_output_data); + } + + // Capture trace on a trace queue + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + // Trace mode execution + for (auto i = 0; i < num_loops; i++) { + tt::ScopedTimer timer("Trace loop " + std::to_string(i)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); + EnqueueTrace(command_queue, tid, kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); + } + Finish(command_queue); + + // Expect same output across all loops + for (auto i = 0; i < num_loops; i++) { + EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); + } + ReleaseTrace(this->device_, tid); +} + +TEST_F(CommandQueueTraceFixture, TensixInstantiateTraceSanity) { + CreateDevice(2048); CommandQueue& command_queue = this->device_->command_queue(); auto input = Buffer::create(this->device_, 2048, 2048, 
BufferType::DRAM); @@ -124,8 +291,8 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { - Setup(2048); +TEST_F(CommandQueueTraceFixture, TensixEnqueueProgramTraceCapture) { + CreateDevice(2048); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); @@ -167,8 +334,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { - Setup(2048); +TEST_F(CommandQueueTraceFixture, TensixEnqueueProgramDeviceCapture) { + CreateDevice(2048); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); @@ -218,8 +385,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { - Setup(6144); +TEST_F(CommandQueueTraceFixture, TensixEnqueueTwoProgramTrace) { + CreateDevice(6144); // Get command queue from device for this test, since its running in async mode CommandQueue& command_queue = this->device_->command_queue(); @@ -294,8 +461,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { } } -TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { - Setup(6144); +TEST_F(CommandQueueTraceFixture, TensixEnqueueMultiProgramTraceBenchmark) { + CreateDevice(6144); CommandQueue& command_queue = this->device_->command_queue(); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp new file mode 100644 index 00000000000..bce5b241610 --- /dev/null +++ 
b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp @@ -0,0 +1,269 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "command_queue_fixture.hpp" +#include "dispatch_test_utils.hpp" +#include "sub_device_test_utils.hpp" + +TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceTraceBasicPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program, false); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + 
EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + for (uint32_t i = 0; i < num_iters; i++) { + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, TensixActiveEthTestSubDeviceTraceBasicEthPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = *device->get_active_ethernet_cores(true).begin(); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), 
incrementer_program, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program, false); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + for (uint32_t i = 0; i < num_iters; i++) { + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, TensixActiveEthTestSubDeviceTraceProgramsReconfigureSubDevices) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::array{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + SubDevice sub_device_3(std::array{CoreRangeSet(std::array{CoreRange({2, 4}, {3, 4}), CoreRange({5, 1}, {6, 3})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = 
*device->get_active_ethernet_cores(true).begin(); + SubDevice sub_device_4(std::array{CoreRangeSet(std::array{CoreRange({2, 1}, {2, 2}), CoreRange({1, 5}, {5, 5})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); + + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_3, sub_device_4}, 3200); + + device->load_sub_device_manager(sub_device_manager_1); + + auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + device->load_sub_device_manager(sub_device_manager_2); + + auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_eth_sync_program(device, sub_device_3, sub_device_4); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + 
Synchronize(device); + + // Capture the trace + auto tid_3 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_3); + + auto tid_4 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_4); + + for (uint32_t i = 0; i < num_iters; i++) { + device->load_sub_device_manager(sub_device_manager_1); + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program_1, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program_1, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + + device->load_sub_device_manager(sub_device_manager_2); + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program_2, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_3, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program_2, false); + ReplayTrace(device, device->command_queue().id(), tid_4, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, 
TensixTestSubDeviceIllegalOperations) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + + // Assert no idle eth cores specified + EXPECT_THROW(SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3})), CoreRangeSet(CoreRange({4, 4}, {4, 4})), CoreRangeSet(CoreRange({5, 5}, {5, 5}))}), std::exception); + for (Device *device : devices_) { + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_2, sub_device_1}, 3200); + device->load_sub_device_manager(sub_device_manager_1); + + auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_1, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + // Can not load a sub-device manager while tracing + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + device->load_sub_device_manager(sub_device_manager_2); + auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_sync_program(device, sub_device_2, sub_device_1); + + EnqueueProgram(device->command_queue(), waiter_program_2, false); + 
EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + Synchronize(device); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + // Regular program execution + // Can not run a program on a different sub-device manager + EXPECT_THROW(EnqueueProgram(device->command_queue(), waiter_program_1, false), std::exception); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_2, false); + + // Can not replay a trace on a different sub-device manager + EXPECT_THROW(ReplayTrace(device, device->command_queue().id(), tid_1, false), std::exception); + + Synchronize(device); + + device->remove_sub_device_manager(sub_device_manager_1); + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp new file mode 100644 index 00000000000..d5d6326cc79 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gtest/gtest.h" +#include "dispatch_fixture.hpp" +#include "hostdevcommon/common_values.hpp" +#include "impl/device/device.hpp" +#include "llrt/hal.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/llrt/rtoptions.hpp" + +class MultiCommandQueueSingleDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + + this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + if (this->num_cqs_ != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + const chip_id_t device_id = 0; + const DispatchCoreType dispatch_core_type = this->get_dispatch_core_type(); + this->create_device(device_id, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + } + + void TearDown() override { + if (this->device_ != nullptr) { + tt::tt_metal::CloseDevice(this->device_); + } + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + } + + DispatchCoreType get_dispatch_core_type() { + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + if (this->arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning( + tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in SetUp()"); + dispatch_core_type = DispatchCoreType::ETH; + } + } + return dispatch_core_type; + } + + void create_device( + const chip_id_t device_id, + const size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE, + const DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER) { + this->device_ = tt::tt_metal::CreateDevice( + device_id, this->num_cqs_, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); + } + + tt::tt_metal::Device *device_ = nullptr; + tt::ARCH arch_; + uint8_t num_cqs_; +}; + +class MultiCommandQueueSingleDeviceEventFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceBufferFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceProgramFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceTraceFixture : public MultiCommandQueueSingleDeviceFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + + this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + if (this->num_cqs_ != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } + + void CreateDevice(const size_t trace_region_size) { + const chip_id_t device_id = 0; + const DispatchCoreType dispatch_core_type = this->get_dispatch_core_type(); + this->create_device(device_id, trace_region_size, dispatch_core_type); + } + + DispatchCoreType dispatch_core_type_; +}; + +class MultiCommandQueueMultiDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + + auto 
num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); + if (num_cqs != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + const tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + if (arch == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()"); + dispatch_core_type = DispatchCoreType::ETH; + } + } + + const chip_id_t mmio_device_id = 0; + reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + for (const auto &[id, device] : reserved_devices_) { + devices_.push_back(device); + } + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + std::vector devices_; + std::map reserved_devices_; +}; + +class MultiCommandQueueMultiDeviceBufferFixture : public MultiCommandQueueMultiDeviceFixture {}; + +class MultiCommandQueueMultiDeviceEventFixture : public MultiCommandQueueMultiDeviceFixture {}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp similarity index 72% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp rename to tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp index b5efa2e0729..02900995280 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp @@ -1,178 +1,19 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 #pragma once -#include -#include -#include -#include -#include "common/core_coord.hpp" -#include "common/env_lib.hpp" -#include "gtest/gtest.h" -#include "hostdevcommon/common_values.hpp" -#include "impl/buffers/circular_buffer_types.hpp" +#include "command_queue_fixture.hpp" #include "impl/device/device.hpp" -#include "impl/kernels/data_types.hpp" -#include "impl/kernels/kernel_types.hpp" -#include "impl/dispatch/command_queue.hpp" #include "llrt/hal.hpp" -#include "tt_cluster_descriptor_types.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" #include "tt_metal/impl/kernels/kernel.hpp" #include "tt_metal/common/tt_backend_api_types.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp" -#include "tt_soc_descriptor.h" +#include "dispatch_test_utils.hpp" -class CommandQueueFixture : public ::testing::Test { - protected: - tt::ARCH arch_; - tt::tt_metal::Device* device_; - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - const int device_id = 0; - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - this->device_ = tt::tt_metal::CreateDevice(device_id, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - tt::tt_metal::CloseDevice(this->device_); - } - } -}; - - -class CommandQueueMultiDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with 
fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices_ < 2 ) { - GTEST_SKIP(); - } - std::vector chip_ids; - for (unsigned int id = 0; id < num_devices_; id++) { - chip_ids.push_back(id); - } - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - reserved_devices_ = tt::tt_metal::detail::CreateDevices(chip_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - for (const auto &[id, device] : reserved_devices_) { - devices_.push_back(device); - } - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - std::vector devices_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class CommandQueueSingleCardFixture : public ::testing::Test { - protected: - void SetUp() override { - this->validate_dispatch_mode(); - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - this->create_devices(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - void validate_dispatch_mode() { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - } - - void create_devices(const std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - const chip_id_t mmio_device_id = 0; - this->reserved_devices_ = tt::tt_metal::detail::CreateDevices( - {mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); - auto enable_remote_chip = getenv("TT_METAL_ENABLE_REMOTE_CHIP"); - if (enable_remote_chip) { - for (const auto &[id, device] : this->reserved_devices_) { - this->devices_.push_back(device); 
- } - } else { - this->devices_.push_back(this->reserved_devices_.at(mmio_device_id)); - } - - this->num_devices_ = this->reserved_devices_.size(); - } - - std::vector devices_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class CommandQueueSingleCardTraceFixture : virtual public CommandQueueSingleCardFixture { - protected: - void SetUp() override { - this->validate_dispatch_mode(); - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - this->create_devices(90000000); - } -}; - -class SingleDeviceTraceFixture: public ::testing::Test { -protected: - tt::tt_metal::Device* device_; - tt::ARCH arch_; - - void Setup(const size_t buffer_size, const uint8_t num_hw_cqs = 1) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - if (num_hw_cqs > 1) { - // Running multi-CQ test. User must set this explicitly. 
- auto num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); - if (num_cqs == nullptr or strcmp(num_cqs, "2")) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const int device_id = 0; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - const chip_id_t mmio_device_id = 0; - this->device_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, buffer_size, dispatch_core_type).at(mmio_device_id); - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - tt::tt_metal::CloseDevice(this->device_); - } - } - -}; - -class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { +class RandomProgramFixture : virtual public CommandQueueSingleCardProgramFixture { protected: static const uint32_t MIN_KERNEL_SIZE_BYTES = 20; static const uint32_t MAX_KERNEL_SIZE_BYTES = 4096; @@ -225,7 +66,7 @@ class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { Device *device_; void SetUp() override { - CommandQueueSingleCardFixture::SetUp(); + CommandQueueSingleCardProgramFixture::SetUp(); this->device_ = this->devices_[0]; this->initialize_seed(); } @@ -510,7 +351,7 @@ class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { } }; -class RandomProgramTraceFixture : public RandomProgramFixture, public CommandQueueSingleCardTraceFixture { +class RandomProgramTraceFixture : virtual public RandomProgramFixture, virtual public CommandQueueSingleCardTraceFixture { protected: static const uint32_t NUM_TRACE_ITERATIONS = 50; Program programs[NUM_PROGRAMS]; diff --git a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp new file mode 100644 index 00000000000..fd02ab47296 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp @@ -0,0 +1,121 @@ +// 
SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "host_api.hpp" + +inline std::tuple> create_single_sync_program( + Device* device, SubDevice sub_device) { + auto syncer_coord = sub_device.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto global_sem = CreateGlobalSemaphore(device, sub_device.cores(HalProgrammableCoreType::TENSIX), INVALID); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + return {std::move(syncer_program), std::move(syncer_coord), std::move(global_sem)}; +} + +inline std::tuple> create_basic_sync_program( + Device* device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { + auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); + auto waiter_core_physical = device->worker_core_from_logical_core(waiter_coord); + auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); + auto syncer_coord = incrementer_cores.ranges().back().end_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); + auto all_cores = waiter_core.merge(incrementer_cores).merge(syncer_core); + auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); + + Program waiter_program = CreateProgram(); + auto waiter_kernel = CreateKernel( + waiter_program, + 
"tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp", + waiter_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array waiter_rt_args = { + global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y}; + SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + + Program incrementer_program = CreateProgram(); + auto incrementer_kernel = CreateKernel( + incrementer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp", + incrementer_cores, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + std::array incrementer_rt_args = { + global_sem->address(), waiter_core_physical.x, waiter_core_physical.y}; + SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); + return { + std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; +} + +inline std::tuple> create_basic_eth_sync_program( + Device* device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { + auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::ACTIVE_ETH).ranges().at(0).start_coord; + auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); + auto waiter_core_physical = device->ethernet_core_from_logical_core(waiter_coord); + auto tensix_waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto tensix_waiter_core 
= CoreRangeSet(CoreRange(tensix_waiter_coord, tensix_waiter_coord)); + auto tensix_waiter_core_physical = device->worker_core_from_logical_core(tensix_waiter_coord); + auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); + auto syncer_coord = incrementer_cores.ranges().back().end_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); + auto all_cores = tensix_waiter_core.merge(incrementer_cores).merge(syncer_core); + auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); + + Program waiter_program = CreateProgram(); + auto waiter_kernel = CreateKernel( + waiter_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp", + waiter_core, + EthernetConfig{.noc = NOC::RISCV_0_default, .processor = DataMovementProcessor::RISCV_0}); + std::array waiter_rt_args = { + global_sem->address(), + incrementer_cores.num_cores(), + syncer_core_physical.x, + syncer_core_physical.y, + tensix_waiter_core_physical.x, + tensix_waiter_core_physical.y, + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE}; + SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + + Program incrementer_program = CreateProgram(); + auto incrementer_kernel = CreateKernel( + incrementer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp", + incrementer_cores, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + std::array 
incrementer_rt_args = { + global_sem->address(), tensix_waiter_core_physical.x, tensix_waiter_core_physical.y}; + SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); + return { + std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; +} diff --git a/tests/tt_metal/tt_metal/eth/CMakeLists.txt b/tests/tt_metal/tt_metal/eth/CMakeLists.txt new file mode 100644 index 00000000000..633c597f9f5 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/CMakeLists.txt @@ -0,0 +1,28 @@ +set(UNIT_TESTS_ETH_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_basic_eth.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_buffer_movement_kernels.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_erisc_app_direct_send.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_ring_gather_kernels.cpp +) + +add_executable(unit_tests_eth ${UNIT_TESTS_ETH_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_eth) + +target_link_libraries(unit_tests_eth PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_eth + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_eth + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp new file mode 100644 index 00000000000..8a871c3dd87 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp @@ -0,0 +1,454 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "command_queue_fixture.hpp" +#include "device_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "host_api.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt; +using namespace tt::test_utils; +// using namespace tt::test_utils::df; + +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +} +} + +namespace unit_tests::erisc::kernels { + + + +/* + * ███╗░░██╗░█████╗░░█████╗░ + * ████╗░██║██╔══██╗██╔══██╗ + * ██╔██╗██║██║░░██║██║░░╚═╝ + * ██║╚████║██║░░██║██║░░██╗ + * ██║░╚███║╚█████╔╝╚█████╔╝ + * ╚═╝░░╚══╝░╚════╝░░╚════╝░ + */ + +bool reader_kernel_no_send( + DispatchFixture* fixture, + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_reader_core, + const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + tt::tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; + + auto input_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = input_dram_buffer->address(); + auto dram_noc_xy = input_dram_buffer->noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); + log_debug( + tt::LogTest, + "Device {}: reading {} bytes 
from dram {} addr {} to ethernet core {} addr {}", + device->id(), + byte_size, + dram_noc_xy.str(), + dram_byte_address, + eth_reader_core.str(), + eth_l1_byte_address); + + auto eth_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", + eth_reader_core, + ethernet_config); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + fixture->WriteBuffer(device, input_dram_buffer, inputs); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); + + tt_metal::SetRuntimeArgs( + program, + eth_reader_kernel, + eth_reader_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + fixture->RunProgram(device, program); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; + } + return pass; +} + +bool writer_kernel_no_receive( + DispatchFixture* fixture, + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_writer_core, + const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + 
tt::tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; + + auto output_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = output_dram_buffer->address(); + auto dram_noc_xy = output_dram_buffer->noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); + log_debug( + tt::LogTest, + "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + device->id(), + byte_size, + eth_writer_core.str(), + eth_l1_byte_address, + dram_noc_xy.str(), + dram_byte_address); + + auto eth_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", + eth_writer_core, + ethernet_config); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + fixture->WriteBuffer(device, output_dram_buffer, all_zeros); + + tt_metal::SetRuntimeArgs( + program, + eth_writer_kernel, + eth_writer_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + fixture->RunProgram(device, program); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; + } + return pass; +} + +bool noc_reader_and_writer_kernels( + tt_metal::Device *device, + const uint32_t 
byte_size, + const uint32_t eth_dst_l1_address, + const uint32_t eth_src_l1_address, + const CoreCoord &logical_eth_core, + const tt_metal::EthernetConfig &reader_eth_config, + const tt_metal::EthernetConfig &writer_eth_config) { + bool pass = true; + + tt_metal::Program program = tt_metal::Program(); + + tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt_metal::BufferType::DRAM + }; + + auto reader_dram_buffer = CreateBuffer(dram_config); + auto writer_dram_buffer = CreateBuffer(dram_config); + + auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); + auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); + + log_debug( + tt::LogTest, + "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + device->id(), + byte_size, + reader_dram_noc_xy.str(), + reader_dram_buffer->address(), + logical_eth_core.str(), + eth_dst_l1_address); + log_debug( + tt::LogTest, + "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + device->id(), + byte_size, + logical_eth_core.str(), + eth_src_l1_address, + writer_dram_noc_xy.str(), + writer_dram_buffer->address()); + + auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); + + auto eth_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", + logical_eth_core, + reader_eth_config); + + tt_metal::SetRuntimeArgs( + program, + eth_reader_kernel, + logical_eth_core, + { + (uint32_t)reader_dram_buffer->address(), + (uint32_t)reader_dram_noc_xy.x, + (uint32_t)reader_dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_dst_l1_address, + }); + + auto eth_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", + logical_eth_core, + writer_eth_config); + + tt_metal::SetRuntimeArgs( + 
program, + eth_writer_kernel, + logical_eth_core, + { + (uint32_t)writer_dram_buffer->address(), + (uint32_t)writer_dram_noc_xy.x, + (uint32_t)writer_dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_src_l1_address, + }); + + auto reader_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + tt_metal::detail::WriteToBuffer(reader_dram_buffer, reader_inputs); + + auto writer_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, writer_inputs, eth_src_l1_address); + + // Clear expected values at output locations + std::vector all_zeros(byte_size / sizeof(uint32_t), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_dst_l1_address); + tt_metal::detail::WriteToBuffer(writer_dram_buffer, all_zeros); + + tt_metal::detail::LaunchProgram(device, program); + + auto eth_readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_dst_l1_address, byte_size); + pass &= (eth_readback_vec == reader_inputs); + if (not pass) { + log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel read incorrect values from DRAM", logical_eth_core.str()); + } + std::vector dram_readback_vec; + tt_metal::detail::ReadFromBuffer(writer_dram_buffer, dram_readback_vec); + pass &= (dram_readback_vec == writer_inputs); + if (not pass) { + log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel wrote incorrect values to DRAM", logical_eth_core.str()); + } + + return pass; +} + +} // namespace unit_tests::erisc::kernels + +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE, 
src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + + for (const auto& eth_core : 
device_1->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + + for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } +} + +/* + * + * ███████╗████████╗██╗░░██╗ + * ██╔════╝╚══██╔══╝██║░░██║ + * █████╗░░░░░██║░░░███████║ + * 
██╔══╝░░░░░██║░░░██╔══██║ + * ███████╗░░░██║░░░██║░░██║ + * ╚══════╝░░░╚═╝░░░╚═╝░░╚═╝ + */ + + + + + +// TODO #14640: Run this on WH when i$ flush issue is addressed +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnIdleErisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + + for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + } +} + +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnIdleErisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + + for (const auto& eth_core : 
device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + } +} + +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnBothIdleEriscs) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t read_write_size_bytes = WORD_SIZE * 2048; + uint32_t reader_dst_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + uint32_t writer_src_address = reader_dst_address + read_write_size_bytes; + tt_metal::EthernetConfig erisc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + tt_metal::EthernetConfig erisc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + + for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( + device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config + )); + erisc0_ethernet_config.noc = tt_metal::NOC::NOC_1; + erisc1_ethernet_config.noc = tt_metal::NOC::NOC_1; + ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( + device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config + )); + } +} diff --git 
a/tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp similarity index 62% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp rename to tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index dd9c95aab8f..746912b90aa 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -3,26 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 #include - -#include -#include -#include +#include #include "device_fixture.hpp" -#include "n300_device_fixture.hpp" +#include "command_queue_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; -using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t MAX_BUFFER_SIZE = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); @@ -30,14 +27,17 @@ struct BankedConfig { size_t num_pages = 1; size_t size_bytes = 1 * 2 * 32 * 32; size_t page_size_bytes = 2 * 32 * 32; - BufferType input_buffer_type = BufferType::L1; - BufferType output_buffer_type = BufferType::L1; + tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; + tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; }; +} // namespace CMAKE_UNIQUE_NAMESPACE +} namespace unit_tests::erisc::kernels { bool chip_to_chip_dram_buffer_transfer( + DispatchFixture* fixture, tt_metal::Device* 
sender_device, tt_metal::Device* receiver_device, const CoreCoord& eth_sender_core, @@ -86,7 +86,7 @@ bool chip_to_chip_dram_buffer_transfer( // Generate inputs auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); + fixture->WriteBuffer(sender_device, input_dram_buffer, inputs); const uint32_t MAX_BUFFER = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); @@ -95,7 +95,7 @@ bool chip_to_chip_dram_buffer_transfer( // Clear expected value at ethernet L1 address std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); + fixture->WriteBuffer(receiver_device, output_dram_buffer, all_zeros); //////////////////////////////////////////////////////////////////////////// // Sender Device @@ -148,14 +148,26 @@ bool chip_to_chip_dram_buffer_transfer( //////////////////////////////////////////////////////////////////////////// // Execute Programs //////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } - std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(sender_device, sender_program); }); - std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(receiver_device, receiver_program); }); + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); + + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } - th1.join(); - th2.join(); std::vector dest_dram_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_dram_data); + 
fixture->ReadBuffer(receiver_device, output_dram_buffer, dest_dram_data); pass &= (dest_dram_data == inputs); if (not pass) { std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; @@ -165,15 +177,15 @@ bool chip_to_chip_dram_buffer_transfer( } bool chip_to_chip_interleaved_buffer_transfer( + DispatchFixture* fixture, tt_metal::Device* sender_device, tt_metal::Device* receiver_device, const CoreCoord& eth_sender_core, const CoreCoord& eth_receiver_core, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const uint32_t& max_transfer_size) { bool pass = true; - const uint32_t input0_cb_index = 0; const uint32_t output_cb_index = 16; @@ -206,7 +218,7 @@ bool chip_to_chip_interleaved_buffer_transfer( auto input_buffer = CreateBuffer(sender_config); bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; - tt_metal::detail::WriteToBuffer(input_buffer, input_packed); + fixture->WriteBuffer(sender_device, input_buffer, input_packed); const uint32_t max_buffer = round_down(max_transfer_size, cfg.page_size_bytes); uint32_t pages_per_loop = max_buffer / cfg.page_size_bytes; @@ -242,6 +254,7 @@ bool chip_to_chip_interleaved_buffer_transfer( std::vector all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); + fixture->WriteBuffer(receiver_device, output_buffer, all_zeros); auto eth_receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -266,21 +279,34 @@ bool chip_to_chip_interleaved_buffer_transfer( //////////////////////////////////////////////////////////////////////////// // Execute Programs //////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, 
sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } + + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); - std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(sender_device, sender_program); }); - std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(receiver_device, receiver_program); }); + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } - th1.join(); - th2.join(); std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_buffer, dest_buffer_data); + fixture->ReadBuffer(receiver_device, output_buffer, dest_buffer_data); pass &= input_packed == dest_buffer_data; return pass; } } // namespace unit_tests::erisc::kernels -TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip0ToChip1) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { const auto& sender_device = devices_.at(0); const auto& receiver_device = devices_.at(1); @@ -288,17 +314,17 @@ TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip0ToChip1) { CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); 
ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); } } -TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip1ToChip0) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip1ToChip0) { const auto& sender_device = devices_.at(1); const auto& receiver_device = devices_.at(0); @@ -306,17 +332,18 @@ TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip1ToChip0) { CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); } } -TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendInterleavedBufferChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& sender_device = devices_.at(0); const auto& receiver_device = 
devices_.at(1); @@ -333,6 +360,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { receiver_eth_core.str()); BankedConfig test_config; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -342,6 +370,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config = BankedConfig{.num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, .page_size_bytes = 2 * 32 * 32}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -349,7 +378,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); test_config = BankedConfig{ .num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, @@ -357,6 +386,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { .input_buffer_type = BufferType::DRAM, .output_buffer_type = BufferType::DRAM}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -364,11 +394,12 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); } } -TEST_F(DeviceFixture, 
EthKernelsSendInterleavedBufferAllConnectedChips) { +TEST_F(DeviceFixture, ActiveEthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (const auto& sender_device : devices_) { for (const auto& receiver_device : devices_) { if (sender_device->id() == receiver_device->id()) { @@ -395,6 +426,97 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { .output_buffer_type = BufferType::DRAM}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::DRAM, + .output_buffer_type = BufferType::L1}; + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + } + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendDramBufferAllConnectedChips) { + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if 
(receiver_device->id() != device_id) { + continue; + } + log_info( + tt::LogTest, + "Sending dram buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + } + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if (receiver_device->id() != device_id) { + continue; + } + + log_info( + tt::LogTest, + "Sending interleaved buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + BankedConfig test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::L1, + .output_buffer_type = BufferType::DRAM}; + + 
ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -402,7 +524,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this),sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); test_config = BankedConfig{ .num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, @@ -410,6 +532,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { .input_buffer_type = BufferType::DRAM, .output_buffer_type = BufferType::L1}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -417,7 +540,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); } } } diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp new file mode 100644 index 00000000000..84114813967 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -0,0 +1,835 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "device_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" +#include "command_queue_fixture.hpp" +#include "tt_metal/common/logger.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; + +struct erisc_info_t { + volatile uint32_t num_bytes; + volatile uint32_t mode; + volatile uint32_t reserved_0_; + volatile uint32_t reserved_1_; + volatile uint32_t bytes_done; + volatile uint32_t reserverd_2_; + volatile uint32_t reserverd_3_; + volatile uint32_t reserverd_4_; +}; +} +} + +using namespace tt; +using namespace tt::test_utils; + +namespace unit_tests::erisc::direct_send { +const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { + TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); + size_t word_size = (max >> 5) - (base >> 5); + return (((rand() % word_size) << 5) + base); +} + +bool eth_direct_sender_receiver_kernels( + DispatchFixture* fixture, + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const size_t& byte_size, + const size_t& src_eth_l1_byte_address, + const size_t& dst_eth_l1_byte_address, + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + uint32_t num_bytes_per_send = 16) { + bool pass = true; + log_debug( + tt::LogTest, + "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", + byte_size, + sender_device->id(), + eth_sender_core.str(), + src_eth_l1_byte_address, + receiver_device->id(), + eth_receiver_core.str(), + dst_eth_l1_byte_address); + // Generate inputs + auto inputs = generate_uniform_random_vector(0, 100, byte_size / 
sizeof(uint32_t)); + llrt::write_hex_vec_to_core( + sender_device->id(), + sender_device->ethernet_core_from_logical_core(eth_sender_core), + inputs, + src_eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + all_zeros, + dst_eth_l1_byte_address); + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program sender_program = tt_metal::Program(); + + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", + eth_sender_core, + tt_metal::EthernetConfig{ + .noc = tt_metal::NOC::NOC_0, + .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + { + (uint32_t)src_eth_l1_byte_address, + (uint32_t)dst_eth_l1_byte_address, + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program receiver_program = tt_metal::Program(); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", + eth_receiver_core, + tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + { + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Execute Programs + 
//////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } + + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); + + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } + + auto readback_vec = llrt::read_hex_vec_from_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + dst_eth_l1_byte_address, + byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; + std::cout << readback_vec[0] << std::endl; + } + return pass; +} + +// Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE +bool send_over_eth( + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const CoreCoord& sender_core, + const CoreCoord& receiver_core, + const size_t& byte_size) { + tt::log_debug( + tt::LogTest, + "Running direct send test with sender chip {} core {}, receiver chip {} core {}, sending {} bytes", + sender_device->id(), + sender_core.str(), + receiver_device->id(), + receiver_core.str(), + byte_size); + std::vector eth_cores = { + CoreCoord(9, 0), + CoreCoord(1, 0), + CoreCoord(8, 0), + CoreCoord(2, 0), + CoreCoord(9, 6), + CoreCoord(1, 6), + CoreCoord(8, 6), + CoreCoord(2, 6), + CoreCoord(7, 0), + CoreCoord(3, 0), + CoreCoord(6, 0), + CoreCoord(4, 0), + CoreCoord(7, 6), + CoreCoord(3, 6), + CoreCoord(6, 6), + CoreCoord(4, 6)}; + + // Disable all eth core runtime app flags, zero out data write counter + std::vector run_test_app_flag = {0x0}; + for (const auto& eth_core : eth_cores) { + 
llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + std::vector zero = {0, 0, 0, 0, 0, 0, 0, 0}; + llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + } + + // TODO: is it possible that receiver core app is stil running when we push inputs here??? + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core( + sender_device->id(), sender_core, inputs, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + + // Zero out receiving address to ensure no stale data is causing tests to pass + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core( + receiver_device->id(), receiver_core, all_zeros, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + + std::vector args_0 = {uint32_t(byte_size), 0}; + llrt::write_hex_vec_to_core(sender_device->id(), sender_core, args_0, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + std::vector args_1 = {uint32_t(byte_size), 1}; + llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + + // TODO: this should be updated to use kernel api + uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + ll_api::memory binary_mem_send = llrt::get_risc_binary( + sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + ll_api::memory binary_mem_receive = llrt::get_risc_binary( + receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + + for (const auto& eth_core : eth_cores) { + 
llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, binary_mem_send.data(), eth_l1_mem::address_map::FIRMWARE_BASE); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, binary_mem_receive.data(), eth_l1_mem::address_map::FIRMWARE_BASE); + } + + // Activate sender core runtime app + run_test_app_flag = {0x1}; + // send remote first, otherwise eth core may be blocked, very ugly for now... + if (receiver_device->id() == 1) { + llrt::write_hex_vec_to_core( + 1, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } else { + llrt::write_hex_vec_to_core(1, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } + if (sender_device->id() == 0) { + llrt::write_hex_vec_to_core(0, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } else { + llrt::write_hex_vec_to_core( + 0, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } + + bool pass = true; + auto readback_vec = llrt::read_hex_vec_from_core( + receiver_device->id(), receiver_core, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, byte_size); + pass &= (readback_vec == inputs); + + return pass; +} + +} // namespace unit_tests::erisc::direct_send + +TEST_F(N300DeviceFixture, ActiveEthSingleCoreDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 6); + CoreCoord sender_core_1 = CoreCoord(1, 6); + + CoreCoord receiver_core_0 = CoreCoord(9, 0); + CoreCoord receiver_core_1 = CoreCoord(1, 0); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, 
device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthSingleCoreDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 0); + CoreCoord sender_core_1 = CoreCoord(1, 0); + + CoreCoord receiver_core_0 = CoreCoord(9, 6); + CoreCoord receiver_core_1 = CoreCoord(1, 6); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthBidirectionalCoreDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 6); + CoreCoord sender_core_1 = CoreCoord(1, 6); + + CoreCoord receiver_core_0 = CoreCoord(9, 0); + CoreCoord receiver_core_1 = CoreCoord(1, 0); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 
1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + srand(0); + + std::map, std::pair> connectivity = { + {{0, CoreCoord(9, 6)}, {1, CoreCoord(9, 0)}}, + {{1, CoreCoord(9, 0)}, {0, CoreCoord(9, 6)}}, + {{0, CoreCoord(1, 6)}, {1, CoreCoord(1, 0)}}, + {{1, CoreCoord(1, 0)}, {0, CoreCoord(1, 6)}}}; + for (int i = 0; i < 1000; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + int num_words = 0; + if constexpr (MAX_NUM_WORDS != 0) { + num_words = rand() % MAX_NUM_WORDS + 1; + } + + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + send_chip, receiver_chip, sender_core, receiver_core, WORD_SIZE * num_words)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = 
devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = device_0->get_connected_ethernet_core(sender_core); + if (device_1->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = device_1->get_connected_ethernet_core(sender_core); + if (device_0->id() != device_id) 
{ + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} + +TEST_F(DeviceFixture, ActiveEthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() == receiver_device->id()) { + continue; + } + for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); + if (receiver_device->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + 
sender_device, + receiver_device, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * 256, + 
src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * 256, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * 1024, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * 1024, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * MAX_NUM_WORDS, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * MAX_NUM_WORDS, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsRepeatedDirectSends) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address + WORD_SIZE * i, + dst_eth_l1_byte_address + WORD_SIZE * i, + sender_core, + receiver_core)); + } + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address + WORD_SIZE * i, + dst_eth_l1_byte_address + WORD_SIZE * i, + receiver_core, + sender_core)); + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; + srand(0); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + std::map, std::tuple> connectivity = {}; + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); + connectivity.insert({{0, sender_core}, receiver_core}); + } + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); + connectivity.insert({{1, sender_core}, receiver_core}); + } + for (int i = 0; i < 1000; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + + const size_t src_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + const size_t dst_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + + int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - + std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / + WORD_SIZE; + int num_words = rand() % max_words + 1; + + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + send_chip, + receiver_chip, + WORD_SIZE * num_words, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} +TEST_F(N300DeviceFixture, ActiveEthKernelsRandomEthPacketSizeDirectSendTests) { + srand(0); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + std::map, std::tuple> connectivity = {}; + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); + connectivity.insert({{0, sender_core}, receiver_core}); + } + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); + connectivity.insert({{1, sender_core}, receiver_core}); + } + std::vector num_bytes_per_send_test_vals = { + 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; + for (const auto& num_bytes_per_send : num_bytes_per_send_test_vals) { + log_info(tt::LogTest, "Random eth send tests with {} bytes per packet", num_bytes_per_send); + for (int i = 0; i < 10; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = 
devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + + const size_t src_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); + const size_t dst_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); + + int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - + std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / + num_bytes_per_send; + int num_words = rand() % max_words + 1; + + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + send_chip, + receiver_chip, + num_bytes_per_send * num_words, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core, + num_bytes_per_send)); + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); + if (receiver_device->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + 
ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } + } + } +} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp rename to tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp index 24ea1924810..03c022dca95 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp @@ -9,7 +9,7 @@ #include #include "device_fixture.hpp" -#include "n300_device_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/kernels/kernel.hpp" @@ -453,7 +453,7 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( } // namespace unit_tests::erisc::kernels -TEST_F(DeviceFixture, EthKernelsDirectRingGatherAllChips) { +TEST_F(DeviceFixture, ActiveEthKernelsDirectRingGatherAllChips) { const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t sem_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; @@ -465,7 +465,7 @@ TEST_F(DeviceFixture, EthKernelsDirectRingGatherAllChips) { device_ring, WORD_SIZE, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); } -TEST_F(DeviceFixture, EthKernelsInterleavedRingGatherAllChips) { +TEST_F(DeviceFixture, ActiveEthKernelsInterleavedRingGatherAllChips) { const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp deleted file mode 120000 index 3584a28b68e..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp deleted file mode 120000 index 9ae4e0133cd..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/basic/test_device_init.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp deleted file mode 120000 index dae6734bc34..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/compute/test_flatten.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp deleted file mode 120000 index c649d4ab585..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp +++ /dev/null @@ -1 +0,0 @@ 
-../unit_tests_common/compute/matmul/test_matmul_large_block.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp deleted file mode 120000 index 066de75928e..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp b/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp deleted file mode 100644 index 660438fe72c..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" - -/* -All the tests in gtest_smoke are symlinks. This test suite is meant to be used for sanity checks. -*/ diff --git a/tests/tt_metal/tt_metal/integration/CMakeLists.txt b/tests/tt_metal/tt_metal/integration/CMakeLists.txt new file mode 100644 index 00000000000..45df1c02483 --- /dev/null +++ b/tests/tt_metal/tt_metal/integration/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_INTEGRATION_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_autonomous_relay_streams.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_basic_pipeline.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_flatten.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_large_block.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_X_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_single_core.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_X_tile.cpp +) + +add_executable(unit_tests_integration 
${UNIT_TESTS_INTEGRATION_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_integration) + +target_link_libraries(unit_tests_integration PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_integration + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_integration + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp index 3c452584cdf..4af31133b7a 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp @@ -2,11 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -15,7 +11,7 @@ #include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; @@ -83,7 +79,7 @@ void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = Ma } } -void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector 
weights, vector tensor_vals){ +void matmul_tile(DispatchFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector weights, vector tensor_vals){ tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -351,7 +347,7 @@ using namespace unit_tests_common::matmul::test_matmul_X_tile; } */ -TEST_F(CommonFixture, MatmulSingleTile){ +TEST_F(DispatchFixture, TensixMatmulSingleTile){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -377,7 +373,7 @@ TEST_F(CommonFixture, MatmulSingleTile){ } } -TEST_F(CommonFixture, MatmulMultiTile){ +TEST_F(DispatchFixture, TensixMatmulMultiTile){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -410,7 +406,7 @@ TEST_F(CommonFixture, MatmulMultiTile){ } } -TEST_F(CommonFixture, MatmulBlock){ +TEST_F(DispatchFixture, TensixMatmulBlock){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -441,7 +437,7 @@ TEST_F(CommonFixture, MatmulBlock){ } } -TEST_F(CommonFixture, MatmulBlockInitShort){ +TEST_F(DispatchFixture, TensixMatmulBlockInitShort){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -472,7 +468,7 @@ TEST_F(CommonFixture, MatmulBlockInitShort){ } } -TEST_F(CommonFixture, MatmulBlockInitShortWithDt){ +TEST_F(DispatchFixture, TensixMatmulBlockInitShortWithDt){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp 
b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp index 42061795180..88f84f8c1fc 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; @@ -137,7 +132,7 @@ void create_CBs_for_fused_matmul(tt_metal::Program &program, tt_metal::Device* d } } -bool matmul_large_block(CommonFixture *fixture, tt_metal::Device *device, bool activations_rm, bool output_rm, MathFidelity math_fidelity = MathFidelity::HiFi4) { +bool matmul_large_block(DispatchFixture *fixture, tt_metal::Device *device, bool activations_rm, bool output_rm, MathFidelity math_fidelity = MathFidelity::HiFi4) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -365,7 +360,7 @@ bool matmul_large_block(CommonFixture *fixture, tt_metal::Device *device, bool a } -TEST_F(CommonFixture, MatmulLargeBlock) { +TEST_F(DispatchFixture, TensixMatmulLargeBlock) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue;; tt::log_info(tt::LogTest, "Math Fidelity = {}", 
i); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp index 8371a43d96c..dce5db7596c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; @@ -368,7 +366,7 @@ bool assign_runtime_args_to_program( return pass; } -bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *device){ +bool matmul_multi_core_multi_dram(DispatchFixture *fixture, tt_metal::Device *device){ bool pass = true; int num_cores_r = device->compute_with_storage_grid_size().y; int num_cores_c = device->compute_with_storage_grid_size().x; @@ -498,7 +496,7 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi } -TEST_F(CommonFixture, MatmulMultiCoreSingleDRAM){ +TEST_F(DispatchFixture, TensixMatmulMultiCoreSingleDRAM){ const char* arch = getenv("ARCH_NAME"); if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "This test 
is only supported in slow dispatch mode"); @@ -512,7 +510,7 @@ TEST_F(CommonFixture, MatmulMultiCoreSingleDRAM){ } } -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAM){ +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAM){ // need to update move_tiles_to_dram to support both slow and fast if (getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "This test is not supported in slow dispatch mode, need to update move_tiles_to_dram.."); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index 6dff35cf86f..aaeba58f704 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast { @@ -473,7 +471,7 @@ bool 
matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::Device *device){ } // namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp index 2dfa1ec9ba3..47a798e4ff2 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; @@ -390,7 +388,7 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::Device *device, int in1_or } } // namespace unit_tests_common::matmul::test_matmul_multi_core -TEST_F(CommonFixture, 
MatmulMultiCoreMultiDRAMIn0MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); @@ -400,7 +398,7 @@ TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn0MCast) { } } -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn1MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn1MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp index 167cfb880ed..62831212fa0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp @@ -2,11 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -15,14 +11,14 @@ #include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_single_core{ -bool matmul_single_core(CommonFixture *fixture, tt_metal::Device *device, int M, int N, int K, int out_subblock_h, int out_subblock_w){ +bool 
matmul_single_core(DispatchFixture *fixture, tt_metal::Device *device, int M, int N, int K, int out_subblock_h, int out_subblock_w){ bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -216,7 +212,7 @@ bool matmul_single_core(CommonFixture *fixture, tt_metal::Device *device, int M, } } // namespace unit_tests_common::matmul::test_matmul_single_core -TEST_F (CommonFixture, MatmulSingleCoreSmall){ +TEST_F (DispatchFixture, TensixMatmulSingleCoreSmall){ uint32_t M = 4; uint32_t K = 4; uint32_t N = 4; @@ -227,7 +223,7 @@ TEST_F (CommonFixture, MatmulSingleCoreSmall){ } } -TEST_F (CommonFixture, MatmulSingleCore){ +TEST_F (DispatchFixture, TensixMatmulSingleCore){ if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "Fast dispatch buffer memory issue, skipping for now"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp rename to tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp index 74080be0bb8..c4911a7c230 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp +++ b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp @@ -10,9 +10,9 @@ #include #include -#include "device/tt_arch_types.h" #include "gtest/gtest.h" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp" +#include "device/tt_arch_types.h" +#include "command_queue_fixture.hpp" #include "tt_metal/common/logger.hpp" #include "impl/device/device.hpp" #include "impl/buffers/circular_buffer.hpp" @@ -26,7 +26,6 @@ #include "tt_metal/test_utils/comparison.hpp" #include "tt_metal/test_utils/df/df.hpp" #include "tt_metal/test_utils/env_vars.hpp" -// #include "tt_metal/test_utils/print_helpers.hpp" 
#include "tt_metal/detail/persistent_kernel_cache.hpp" #include "tt_metal/test_utils/stimulus.hpp" @@ -267,11 +266,6 @@ void build_and_run_autonomous_stream_test( uint32_t buffer_size_bytes = num_messages * page_size; auto inputs = test_utils::generate_uniform_random_vector(0, 100, buffer_size_bytes / sizeof(uint32_t)); std::iota(inputs.begin(), inputs.end(), 1); - // for (auto i = 0; i < inputs.size(); i += page_size) { - // for (auto ii = 0; ii < std::min(page_size, inputs.size() - i); ii++) { - // inputs.at(i + ii) = i + 1; - // } - // } auto zeroes_buffer = std::vector(buffer_size_bytes / sizeof(uint32_t), 0); std::vector outputs(buffer_size_bytes / sizeof(uint32_t), 0); @@ -648,7 +642,7 @@ void build_and_run_autonomous_stream_test( } // namespace tt -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreams) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreams) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -691,7 +685,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreams) { return; } -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSmallPackets) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsSmallPackets) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -734,7 +728,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSmallPackets) { return; } -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingShort) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingShort) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -780,7 +774,7 @@ 
TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingShort) { // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingRandomShort) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingRandomShort) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); // if (num_devices != 8) { @@ -835,13 +829,9 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingRandomShor // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingLong) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingLong) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - // if (num_devices != 8) { - // log_info(tt::LogTest, "Need at least 2 devices to run this test"); - // return; - // } if (arch == tt::ARCH::GRAYSKULL) { log_info(tt::LogTest, "Test must be run on WH"); return; @@ -885,7 +875,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingLong) { // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSweep) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsSweep) 
{ auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp rename to tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp index 3fc76d32d74..b5fff829dba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp +++ b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp @@ -13,13 +13,10 @@ #include #include "tt_metal/common/bfloat16.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp" +#include "command_queue_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" using std::map; @@ -44,10 +41,6 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf tt_metal::Program program = tt_metal::CreateProgram(); - // uint32_t num_tiles = 32; - // uint32_t block_size_tiles = 16; - // uint32_t num_blocks_in_CB = 2; - // uint32_t num_repetitions = 1; uint32_t num_cores = (uint32_t)test_config.num_cores; uint32_t num_tiles = (uint32_t)test_config.num_tiles; uint32_t block_size_tiles = (uint32_t)test_config.block_size_tiles; @@ -245,7 +238,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf } // namespace unit_tests::create_pipeline -TEST_F(CommandQueueFixture, TestPipelineAcrossRows) { +TEST_F(CommandQueueProgramFixture, TensixTestPipelineAcrossRows) { if (this->arch_ 
!= tt::ARCH::GRAYSKULL) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp b/tests/tt_metal/tt_metal/integration/test_flatten.cpp similarity index 54% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp rename to tests/tt_metal/tt_metal/integration/test_flatten.cpp index 5dbadc80812..27507bf32c0 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/integration/test_flatten.cpp @@ -2,22 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "command_queue_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" -#include "llrt/llrt.hpp" - - using std::vector; using namespace tt; -namespace gtest_smoke::test_flatten{ +namespace test_flatten { uint32_t prod(vector &shape) { uint32_t shape_prod = 1; @@ -66,7 +60,7 @@ inline std::vector gold_standard_flatten(std::vector src_vec return expected_dst_vec; } -bool flatten(CommonFixture *fixture, tt_metal::Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { +bool flatten(DispatchFixture *fixture, tt_metal::Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -189,22 +183,163 @@ bool flatten(CommonFixture *fixture, tt_metal::Device *device, uint32_t num_tile return pass; } +bool flatten_stress(Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { + // Test Simulating Program Caching with Async Command Queues + bool pass = true; + // Create a program used across all loops + Program program = CreateProgram(); + + CoreCoord core = {0, 0}; + + uint32_t single_tile_size = 2 * 1024; + + uint32_t num_tiles = num_tiles_r * num_tiles_c; + uint32_t num_bytes_per_tensor_row = 
num_tiles_c * 64; + uint32_t num_bytes_per_tile = num_tiles * single_tile_size; + + uint32_t dram_buffer_size = single_tile_size * num_tiles * 32; + + InterleavedBufferConfig dram_config{ + .device=device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = BufferType::DRAM + }; + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = 8; + CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); + + uint32_t ouput_cb_index = 16; + uint32_t num_output_tiles = 1; + CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(ouput_cb_index, single_tile_size); + auto cb_output = CreateCircularBuffer(program, core, cb_output_config); + + auto flatten_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + + auto unary_writer_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/writer_unary.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + vector compute_kernel_args = { + num_tiles * 32 + }; + + auto eltwise_unary_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", + core, + ComputeConfig{.compile_args = compute_kernel_args} + ); + + // Inside the loop, run async runtime functions + for (int i = 0; i < 1000; i++) { + // Create Device Buffers Asynchronously + auto src_dram_buffer = CreateBuffer(dram_config); + auto dst_dram_buffer = CreateBuffer(dram_config); + + auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); + auto dram_dst_noc_xy = 
dst_dram_buffer->noc_coordinates(); + // Create the source vector + std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( + dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count())); + + std::vector golden = gold_standard_flatten(*src_vec, {num_tiles_r * 32, num_tiles_c * 32}); + // Set the runtime args asynchronously + std::shared_ptr writer_runtime_args = std::make_shared(); + std::shared_ptr compute_runtime_args = std::make_shared(); + *compute_runtime_args = { + src_dram_buffer.get(), + (std::uint32_t)dram_src_noc_xy.x, + (std::uint32_t)dram_src_noc_xy.y, + num_tiles_r, + num_tiles_c, + num_bytes_per_tensor_row + }; + *writer_runtime_args = { + dst_dram_buffer.get(), + (std::uint32_t)dram_dst_noc_xy.x, + (std::uint32_t)dram_dst_noc_xy.y, + num_tiles * 32 + }; + + SetRuntimeArgs( + device, + detail::GetKernel(program, flatten_kernel), + core, + compute_runtime_args); + + SetRuntimeArgs( + device, + detail::GetKernel(program, unary_writer_kernel), + core, + writer_runtime_args); + // Async write input + EnqueueWriteBuffer(device->command_queue(), src_dram_buffer, src_vec, false); + // Share ownership of buffer with program + AssignGlobalBufferToProgram(src_dram_buffer, program); + // Main thread gives up ownership of buffer and src data (this is what python does) + src_dram_buffer.reset(); + src_vec.reset(); + // Queue up program + EnqueueProgram(device->command_queue(), program, false); + // Blocking read + std::vector result_vec; + EnqueueReadBuffer(device->command_queue(), dst_dram_buffer, result_vec, true); + + // Validation of data + TT_FATAL(golden.size() == result_vec.size(), "Size mismatch between golden {} and result vec {}.", golden.size(), result_vec.size()); + pass &= (golden == result_vec); + + if (not pass) { + std::cout << "GOLDEN" << std::endl; + print_vec_of_uint32_as_packed_bfloat16(golden, num_tiles * 32); + + std::cout << "RESULT" << std::endl; + 
print_vec_of_uint32_as_packed_bfloat16(result_vec, num_tiles * 32); + } + } + return pass; } -TEST_F(CommonFixture, Flatten){ +} + +TEST_F(DispatchFixture, TensixFlatten){ // TODO: Re-enable when #7264 is fixed GTEST_SKIP(); uint32_t num_tiles_r = 2; uint32_t num_tiles_c = 2; - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ + if (!this->IsSlowDispatch()){ log_info(LogTest, "Flatten running with num_tiles_r=1, num_tiles_c=1"); num_tiles_r = 1; num_tiles_c = 1; } for (unsigned int id=0; id < devices_.size(); id++){ // TODO: #6097, fix this for fast dispatch remote device. - if (!this->slow_dispatch_ && id > 0) + if (!this->IsSlowDispatch() && id > 0) continue; - ASSERT_TRUE(gtest_smoke::test_flatten::flatten(this, devices_.at(id), num_tiles_r, num_tiles_c)); + ASSERT_TRUE(test_flatten::flatten(this, this->devices_.at(id), num_tiles_r, num_tiles_c)); + } +} + +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAsyncFlattenStress) { + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + uint32_t num_tiles_r = 2; + uint32_t num_tiles_c = 2; + if (!this->IsSlowDispatch()) { + num_tiles_r = 1; + num_tiles_c = 1; } + ASSERT_TRUE(test_flatten::flatten_stress(this->device_, num_tiles_r, num_tiles_c)); + command_queue.set_mode(current_mode); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp rename to tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp index 06cd4a16177..977f495773e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp @@ -6,16 +6,12 @@ #include #include -#include -#include #include "command_queue_fixture.hpp" #include 
"tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" @@ -23,7 +19,6 @@ using std::map; using std::vector; using namespace tt; using namespace tt::test_utils; -using namespace tt::test_utils::df; using namespace tt::tt_metal; namespace unit_tests::sfpu_util { @@ -233,7 +228,7 @@ bool run_sfpu_all_same_buffer(CommandQueue & cq, const SfpuConfig& test_config) class SingleCoreSingleCardSfpuParameterizedFixture : public CommandQueueSingleCardFixture, public testing::WithParamInterface> { }; -TEST_P(SingleCoreSingleCardSfpuParameterizedFixture, SfpuCompute) { +TEST_P(SingleCoreSingleCardSfpuParameterizedFixture, TensixSfpuCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -279,7 +274,7 @@ class SingleCoreSingleCardSfpuParameterizedApproxFixture : public CommandQueueSingleCardFixture, public testing::WithParamInterface> {}; -TEST_P(SingleCoreSingleCardSfpuParameterizedApproxFixture, SfpuCompute) { +TEST_P(SingleCoreSingleCardSfpuParameterizedApproxFixture, TensixSfpuCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -326,7 +321,7 @@ class MultiCoreSingleCardSfpuParameterizedApproxFixture : public CommandQueueSingleCardFixture, public testing::WithParamInterface> {}; -TEST_P(MultiCoreSingleCardSfpuParameterizedApproxFixture, AllCoreMultiTileSfpuApproxCompute) { +TEST_P(MultiCoreSingleCardSfpuParameterizedApproxFixture, TensixAllCoreMultiTileSfpuApproxCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); diff --git a/tests/tt_metal/tt_metal/llk/CMakeLists.txt 
b/tests/tt_metal/tt_metal/llk/CMakeLists.txt new file mode 100644 index 00000000000..e2b41060099 --- /dev/null +++ b/tests/tt_metal/tt_metal/llk/CMakeLists.txt @@ -0,0 +1,36 @@ +set(UNIT_TESTS_LLK_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_broadcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_copy_block_matmul_partials.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_cumsum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dropout_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_golden_impls.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_reconfig.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_reduce.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_single_core_binary_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_single_core_matmul_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_transpose.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_untilize_tilize.cpp +) + +add_executable(unit_tests_llk ${UNIT_TESTS_LLK_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_llk) + +target_link_libraries(unit_tests_llk PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_llk + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_llk + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp rename to tests/tt_metal/tt_metal/llk/test_broadcast.cpp index 43963dc422e..5642aa3dfed 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp +++ b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp @@ -289,11 +289,11 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& } } 
-class BroadcastParametrizedDeviceFixture : public DeviceFixture, +class BroadcastParameterizedDeviceFixture : public DeviceFixture, public testing::WithParamInterface { }; -TEST_P(BroadcastParametrizedDeviceFixture, ComputeSingleTileBroadcast) { +TEST_P(BroadcastParameterizedDeviceFixture, TensixComputeSingleTileBroadcast) { unit_tests::compute::broadcast::BroadcastConfig test_config = GetParam(); for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; @@ -307,7 +307,7 @@ using namespace unit_tests::compute::broadcast; INSTANTIATE_TEST_SUITE_P( ComputeSingleTileBroadcast, - BroadcastParametrizedDeviceFixture, + BroadcastParameterizedDeviceFixture, ::testing::Values( (BroadcastConfig){ApiConvention::DEFAULT, EltwiseOp::ADD, BroadcastDim::ROW}, (BroadcastConfig){ApiConvention::DEFAULT, EltwiseOp::ADD, BroadcastDim::COL}, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp rename to tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp index 54b747da19a..8fae6bd2f93 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp @@ -170,7 +170,7 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const // - matmul_pack_tile //////////////////////////////////////////////////////////////////////////// -TEST_F(DeviceFixture, DISABLED_ComputeCopyBlockSingle) { +TEST_F(DeviceFixture, DISABLED_TensixComputeCopyBlockSingle) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; @@ -185,7 +185,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeCopyBlockSingle) { } } } 
-TEST_F(DeviceFixture, ComputeCopyBlockMultiple) { +TEST_F(DeviceFixture, TensixComputeCopyBlockMultiple) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; @@ -204,7 +204,7 @@ TEST_F(DeviceFixture, ComputeCopyBlockMultiple) { } } -TEST_F(DeviceFixture, ComputeCopyBlockComputeBottleneck) { +TEST_F(DeviceFixture, TensixComputeCopyBlockComputeBottleneck) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp rename to tests/tt_metal/tt_metal/llk/test_cumsum.cpp index 66119879e6c..5bcd766d02c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp @@ -179,7 +179,7 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c } } -TEST_F(DeviceFixture, ComputeCumsumColumnwise) { +TEST_F(DeviceFixture, TensixComputeCumsumColumnwise) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); // Not implemented for GRAYSKULL @@ -201,7 +201,7 @@ TEST_F(DeviceFixture, ComputeCumsumColumnwise) { } } -TEST_F(DeviceFixture, ComputeCumsumRowwise) { +TEST_F(DeviceFixture, TensixComputeCumsumRowwise) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); // Not implemented for GRAYSKULL diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp index 
655aeb87cfe..e0f1a53b062 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp @@ -239,7 +239,7 @@ void test_dropout(tt_metal::Device* device, const DropoutConfig& test_config) { } -TEST_F(DeviceFixture, ComputeDropout) { +TEST_F(DeviceFixture, TensixComputeDropout) { if (this->arch_ != tt::ARCH::WORMHOLE_B0) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/llk/test_golden_impls.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp rename to tests/tt_metal/tt_metal/llk/test_golden_impls.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp b/tests/tt_metal/tt_metal/llk/test_golden_impls.hpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp rename to tests/tt_metal/tt_metal/llk/test_golden_impls.hpp diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp rename to tests/tt_metal/tt_metal/llk/test_reconfig.cpp index b55c6329938..df7f9810809 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp @@ -324,7 +324,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c // - pack_reconfig_l1_acc //////////////////////////////////////////////////////////////////////////// -TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) { +TEST_F(DeviceFixture, TensixTileCopyReconfigExplicitSplitDstAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -363,7 +363,7 @@ TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) { } } -TEST_F(DeviceFixture, TileCopyReconfigL1Acc) { +TEST_F(DeviceFixture, 
TensixTileCopyReconfigL1Acc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/llk/test_reduce.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp rename to tests/tt_metal/tt_metal/llk/test_reduce.cpp index 926af4510f7..6ccc3164056 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reduce.cpp @@ -384,7 +384,7 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig using namespace unit_tests::compute::reduce; -TEST_F(DeviceFixture, ComputeReduceH) { +TEST_F(DeviceFixture, TensixComputeReduceH) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) GTEST_SKIP(); @@ -422,7 +422,7 @@ TEST_F(DeviceFixture, ComputeReduceH) { } } -TEST_F(DeviceFixture, ComputeReduceW) { +TEST_F(DeviceFixture, TensixComputeReduceW) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -457,7 +457,7 @@ TEST_F(DeviceFixture, ComputeReduceW) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHW) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHW) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -493,7 +493,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeReduceHW) { } } -TEST_F(DeviceFixture, ComputeReduceHMathOnly) { +TEST_F(DeviceFixture, TensixComputeReduceHMathOnly) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) 
GTEST_SKIP(); @@ -532,7 +532,7 @@ TEST_F(DeviceFixture, ComputeReduceHMathOnly) { } } -TEST_F(DeviceFixture, ComputeReduceWMathOnly) { +TEST_F(DeviceFixture, TensixComputeReduceWMathOnly) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -568,7 +568,7 @@ TEST_F(DeviceFixture, ComputeReduceWMathOnly) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHWMathOnly) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHWMathOnly) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -605,7 +605,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeReduceHWMathOnly) { } } -TEST_F(DeviceFixture, ComputeReduceHShortInit) { +TEST_F(DeviceFixture, TensixComputeReduceHShortInit) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) GTEST_SKIP(); @@ -644,7 +644,7 @@ TEST_F(DeviceFixture, ComputeReduceHShortInit) { } } -TEST_F(DeviceFixture, ComputeReduceWShortInit) { +TEST_F(DeviceFixture, TensixComputeReduceWShortInit) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -680,7 +680,7 @@ TEST_F(DeviceFixture, ComputeReduceWShortInit) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHWShortInit) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHWShortInit) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = 
uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp index 35ffb316d01..a3c1a77cc29 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp @@ -228,7 +228,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c class SingleCoreSingleDeviceSfpuParameterizedFixture : public DeviceFixture, public testing::WithParamInterface> { }; -TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, SfpuCompute) { +TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, TensixSfpuCompute) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -272,7 +272,7 @@ class SingleCoreSingleDeviceSfpuParameterizedApproxFixture : public DeviceFixture, public testing::WithParamInterface> {}; -TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, SfpuCompute) { +TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, TensixSfpuCompute) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -318,7 +318,7 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(4, "log"), std::make_tuple(4, "tanh"))); -TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixMultiContinguousCoreSingleTileSfpuApproxCompute) { CoreRange core_range({0, 0}, {1, 0}); CoreRangeSet core_range_set({core_range}); unit_tests::compute::sfpu::SfpuConfig test_config = { @@ -356,7 +356,7 @@ TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, 
DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixMultiContinguousCoreMultiTileSfpuApproxCompute) { CoreRange core_range({0, 0}, {1, 0}); CoreRangeSet core_range_set({core_range}); unit_tests::compute::sfpu::SfpuConfig test_config = { @@ -394,7 +394,7 @@ TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) { test_config.sfpu_op = "tanh"; EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixAllCoreSingleTileSfpuApproxCompute) { unit_tests::compute::sfpu::SfpuConfig test_config = { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, @@ -433,7 +433,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) { test_config.sfpu_op = "tanh"; EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixAllCoreMultiTileSfpuApproxCompute) { unit_tests::compute::sfpu::SfpuConfig test_config = { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp index 3be28d9843e..721daa15c22 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp @@ -278,7 +278,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& } } // namespace unit_tests::compute::binary -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAdd) { 
+TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileAdd) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -296,7 +296,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAdd) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSub) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileSub) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -314,7 +314,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSub) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMul) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileMul) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -332,7 +332,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMul) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAddFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileAddFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -351,7 +351,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAddFullInit) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSubFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileSubFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -370,7 +370,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSubFullInit) { } } -TEST_F(DeviceFixture, 
BinaryComputeSingleCoreSingleTileMulFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileMulFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -389,7 +389,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMulFullInit) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAddWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -407,7 +407,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSubWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -425,7 +425,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMulWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -443,7 +443,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAdd) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAdd) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = 
{ @@ -461,7 +461,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAdd) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSub) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSub) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -479,7 +479,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSub) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMul) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMul) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -497,7 +497,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMul) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAddDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -521,7 +521,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddDestAcc) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSubDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -545,7 +545,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubDestAcc) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMulDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp rename to 
tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp index 140874255df..df5583ecf29 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp @@ -604,22 +604,22 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N } } // namespace unit_tests::compute::matmul -TEST_F(DeviceFixture, TestSingleCoreSingleTileComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_tile_matmul(this->devices_.at(id))); } } -TEST_F(DeviceFixture, TestSingleCoreSingleBlockSingleTileComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 1, 1, 1)); } } -TEST_F(DeviceFixture, TestSingleCoreSingleBlockSingleTileAccumulationComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileAccumulationComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 1, 2, 1)); } } -TEST_F(DeviceFixture, TestSingleCoreSingleBlockSingleTileNoAccumulationComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileNoAccumulationComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 2, 1, 2)); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp b/tests/tt_metal/tt_metal/llk/test_transpose.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp rename to tests/tt_metal/tt_metal/llk/test_transpose.cpp index 26fcf5069d0..53781734a61 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp +++ b/tests/tt_metal/tt_metal/llk/test_transpose.cpp @@ -187,7 +187,7 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& } // namespace unit_tests::compute::transpose -TEST_F(DeviceFixture, ComputeTransposeWH) { +TEST_F(DeviceFixture, TensixComputeTransposeWH) { unit_tests::compute::transpose::TransposeConfig test_config = { .short_init = false, .single_tile_size = 2 * 1024, @@ -196,7 +196,7 @@ TEST_F(DeviceFixture, ComputeTransposeWH) { unit_tests::compute::transpose::run_single_core_transpose(this->devices_.at(0), test_config); } -TEST_F(DeviceFixture, ComputeTransposeWHShortInit) { +TEST_F(DeviceFixture, TensixComputeTransposeWHShortInit) { unit_tests::compute::transpose::TransposeConfig test_config = { .short_init = true, .single_tile_size = 2 * 1024, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp rename to tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp index c3add81f771..29e8ba09b78 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp @@ -307,7 +307,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& Following tests are for Unpack Tilize ***************************************/ -TEST_F(DeviceFixture, ComputeUnpackTilize) { +TEST_F(DeviceFixture, TensixComputeUnpackTilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -330,7 +330,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilize) { } } -TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { +TEST_F(DeviceFixture, TensixComputeUnpackTilizeA_B) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { 
GTEST_SKIP(); @@ -349,7 +349,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { } } -TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) { +TEST_F(DeviceFixture, TensixComputeUnpackTilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -377,7 +377,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) { Following tests are for Unpack Untilize ***************************************/ -TEST_F(DeviceFixture, ComputeUnpackUntilize) { +TEST_F(DeviceFixture, TensixComputeUnpackUntilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -400,7 +400,7 @@ TEST_F(DeviceFixture, ComputeUnpackUntilize) { } } -TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { +TEST_F(DeviceFixture, TensixComputeUnpackUntilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -427,7 +427,7 @@ TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { /************************************** Following tests are for pack untilize ***************************************/ -TEST_F(DeviceFixture, ComputePackUntilize) { +TEST_F(DeviceFixture, TensixComputePackUntilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -450,7 +450,7 @@ TEST_F(DeviceFixture, ComputePackUntilize) { } } -TEST_F(DeviceFixture, ComputePackUntilizeShortInit) { +TEST_F(DeviceFixture, TensixComputePackUntilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -474,7 +474,7 @@ TEST_F(DeviceFixture, ComputePackUntilizeShortInit) { } } -TEST_F(DeviceFixture, 
ComputePackUntilizeDst) { +TEST_F(DeviceFixture, TensixComputePackUntilizeDst) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool dst_full_sync_en : {true, false}) { @@ -495,7 +495,7 @@ TEST_F(DeviceFixture, ComputePackUntilizeDst) { //Tests pack_untilize with tiny tile dims. //Row dim 1x32, which is faces = 2, rows = 1 //Row dim 1x16, which is faces = 1, rows = 1 -TEST_F(DeviceFixture, ComputePackUntilizeDstTinyTile) { +TEST_F(DeviceFixture, TensixComputePackUntilizeDstTinyTile) { vector > test_config_values = {{1, 1, 1, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}}; uint32_t face_c_dim = 16; for(auto test_config_value : test_config_values) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index f914d3ca87b..39cf7eaa5af 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -22,7 +22,7 @@ #include "tt_metal/common/work_split.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index 7af8eb29d35..932a66637ef 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -22,7 +22,7 @@ #include "tt_metal/common/work_split.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index d72ac2a08b1..6d38bd9c9ea 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -22,10 +22,10 @@ #include "tt_metal/common/constants.hpp" #include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include "tt_metal/common/work_split.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/stl/CMakeLists.txt b/tests/tt_metal/tt_metal/stl/CMakeLists.txt new file mode 100644 index 00000000000..0f1100b0e6f --- /dev/null +++ b/tests/tt_metal/tt_metal/stl/CMakeLists.txt @@ -0,0 +1,26 @@ +set(UNIT_TESTS_STL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_any_range.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_slotmap.cpp +) + +add_executable(unit_tests_stl ${UNIT_TESTS_STL_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_stl) + +target_link_libraries(unit_tests_stl PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_stl + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + 
${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_stl + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/tt_stl/test_any_range.cpp b/tests/tt_metal/tt_metal/stl/test_any_range.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/tt_stl/test_any_range.cpp rename to tests/tt_metal/tt_metal/stl/test_any_range.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests/tt_stl/slotmap.cpp b/tests/tt_metal/tt_metal/stl/test_slotmap.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/tt_stl/slotmap.cpp rename to tests/tt_metal/tt_metal/stl/test_slotmap.cpp diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/test_enqueue_program.cpp similarity index 100% rename from tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp rename to tests/tt_metal/tt_metal/test_enqueue_program.cpp diff --git a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp deleted file mode 100644 index aceb624577e..00000000000 --- a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include - -#include "assert.hpp" -#include "core_coord.hpp" -#include "detail/tt_metal.hpp" -#include "host_api.hpp" -#include "impl/kernels/data_types.hpp" -#include "impl/program/program.hpp" -#include "llrt/rtoptions.hpp" -#include "tt_cluster_descriptor_types.h" - -using namespace tt; -using namespace tt::tt_metal; -using namespace tt::llrt; - -class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { - protected: - void SetUp() override { - this->validate_preconditions(); - - const chip_id_t device_id = 0; - this->device_ = CreateDevice(device_id); - this->program_ = CreateProgram(); - } - - void TearDown() override { CloseDevice(this->device_); } - - void create_kernel(const string &kernel_file) { - CoreCoord core(0, 0); - tt_metal::CreateKernel( - this->program_, - kernel_file, - core, - tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - } - - void setup_kernel_dir(const string &orig_kernel_file, const string &new_kernel_file) { - const string &kernel_dir = OptionsG.get_kernel_dir(); - const std::filesystem::path &kernel_file_path_under_kernel_dir(kernel_dir + new_kernel_file); - const std::filesystem::path &dirs_under_kernel_dir = kernel_file_path_under_kernel_dir.parent_path(); - std::filesystem::create_directories(dirs_under_kernel_dir); - - const string &metal_root = OptionsG.get_root_dir(); - const std::filesystem::path &kernel_file_path_under_metal_root(metal_root + orig_kernel_file); - std::filesystem::copy(kernel_file_path_under_metal_root, kernel_file_path_under_kernel_dir); - } - - void cleanup_kernel_dir() { - const string &kernel_dir = OptionsG.get_kernel_dir(); - for (const std::filesystem::directory_entry &entry : std::filesystem::directory_iterator(kernel_dir)) { - std::filesystem::remove_all(entry); - } - } - - Device *device_; - Program program_; - - private: - void validate_preconditions() { - 
this->validate_env_vars_are_set(); - this->validate_kernel_dir_is_valid(); - } - - void validate_env_vars_are_set() { - if (!OptionsG.is_root_dir_specified()) { - GTEST_SKIP() << "Skipping test: TT_METAL_HOME must be set"; - } - if (!OptionsG.is_kernel_dir_specified()) { - GTEST_SKIP() << "Skipping test: TT_METAL_KERNEL_PATH must be set"; - } - } - - void validate_kernel_dir_is_valid() { - const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); - if (!this->does_path_exist(kernel_dir) || !this->is_path_a_directory(kernel_dir) || - !this->is_dir_empty(kernel_dir)) { - GTEST_SKIP() << "Skipping test: TT_METAL_KERNEL_PATH must be an existing, empty directory"; - } - } - - bool does_path_exist(const string &path) { - const std::filesystem::path &file_path(path); - return std::filesystem::exists(file_path); - } - - bool is_path_a_directory(const string &path) { - TT_FATAL(this->does_path_exist(path), "{} does not exist", path); - const std::filesystem::path &file_path(path); - return std::filesystem::is_directory(file_path); - } - - bool is_dir_empty(const string &path) { - TT_FATAL(this->does_path_exist(path), "{} does not exist", path); - TT_FATAL(this->is_path_a_directory(path), "{} is not a directory", path); - const std::filesystem::path &file_path(path); - return std::filesystem::is_empty(file_path); - } -}; - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderMetalRootDir) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - create_kernel(kernel_file); - detail::CompileProgram(this->device_, this->program_); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderKernelRootDir) { - const string &orig_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - const string &new_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/new_kernel.cpp"; - this->setup_kernel_dir(orig_kernel_file, new_kernel_file); - this->create_kernel(new_kernel_file); - 
detail::CompileProgram(this->device_, this->program_); - this->cleanup_kernel_dir(); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderMetalRootDirAndKernelRootDir) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - this->setup_kernel_dir(kernel_file, kernel_file); - this->create_kernel(kernel_file); - detail::CompileProgram(this->device_, this->program_); - this->cleanup_kernel_dir(); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, NonExistentKernel) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/non_existent_kernel.cpp"; - this->create_kernel(kernel_file); - EXPECT_THROW(detail::CompileProgram(this->device_, this->program_), std::exception); -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt deleted file mode 100644 index 863ee7786e1..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt +++ /dev/null @@ -1,97 +0,0 @@ -set(UNIT_TESTS_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_allocator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_l1_banking_allocator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/device.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/initialize_semaphores.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/runtime_args.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/test_noc.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/test_soc_descriptor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_banked.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_buffer_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_sharded_l1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_simple_dram_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_simple_l1_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_allocation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_creation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_non_blocking.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_golden_impls.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_reduce.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_single_core_binary_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_single_core_matmul_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_sfpu_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_dropout_sfpu_compute.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/compute/test_untilize_tilize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_copy_block_matmul_partials.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_reconfig.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_transpose.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_broadcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_cumsum.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_adjacent.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_contains.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_intersects.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_iterator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_merge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_construct.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_contains.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_intersects.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_merge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/direct.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/host_apis/test_tilize_untilize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/basic_eth_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/buffer_movement_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/device_cluster_api.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/erisc_app_direct_send.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/ring_gather_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/global_semaphore/test_global_semaphores.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tt_stl/test_any_range.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tt_stl/slotmap.cpp -) - -add_executable( - unit_tests - ${UNIT_TESTS_SRC} - $ -) -TT_ENABLE_UNITY_BUILD(unit_tests) -add_executable(unit_tests_galaxy ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/galaxy_cluster_api.cpp) - -target_link_libraries( - unit_tests - PRIVATE - test_metal_common_libs - Boost::smart_ptr -) -target_link_libraries( - unit_tests_galaxy - PRIVATE - test_metal_common_libs - Boost::smart_ptr -) - 
-target_include_directories( - unit_tests - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer -) -target_include_directories( - unit_tests_galaxy - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) - -set_target_properties( - unit_tests - unit_tests_galaxy - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests/README.md b/tests/tt_metal/tt_metal/unit_tests/README.md deleted file mode 100644 index 55aab607296..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Summary -Unit testing uses the doctest framework. See https://github.com/doctest/doctest/ -Generally, there are three main levels of organization: -* TEST_SUITE - Used to group main areas of tests -* TEST_CASE - How Test case and sub-case gets split up is at test-writer discretion, but see the test_case section -* SUB_CASE - - -## Build && Execution -### Build -`make tests/tt_metal/unit_tests` -### Get Help -`./build/test/tt_metal/unit_tests --help` -### Execute all tests -`./build/test/tt_metal/unit_tests` -### Execute filtered test-suite -`./build/test/tt_metal/unit_tests -ts="*Sfpu*"` -### List all test-suite with filter -`./build/test/tt_metal/unit_tests -ts="*Sfpu*" -lts` - -## Folder Structure -General structure of the tests are as follows, more sub-folders can be added -
-Directory Structure - Please add any new-tests to a corresponding folder. -
-
-tt_metal/unit_tests/
-  > test_main.cpp
-  > basic/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > common/
-    > # Used to hold any common structures across all test suites like fixtures
-  > dram/
-    > # Any dram unit/stress test files can exist here, will be automatically added to test_main
-  > compute/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > new_folders/
-    > # Any test files can exist here, will be automatically added to test_main
-test_utils/
-  > comparison.cpp # Useful utils for comparing, see example usages in unit tests
-  > print_helpers.cpp # Useful utils for printin
-  > stimulus.cpp # Useful utils for generating random vectors or specific vectors, see example usages in unit tests
-  > tilization.cpp # Useful utils for converting between tiled vectors or not, see example usages in unit tests
-
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp deleted file mode 100644 index e3b46a78266..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" - -namespace tt::test::buffer::detail { -void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data); -void readL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, uint32_t byte_size, std::vector& data); -void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data); -void readDramBackdoor( - tt::tt_metal::Device* device, uint32_t channel, uint32_t address, uint32_t byte_size, std::vector& data); -} // namespace tt::test::buffer::detail diff --git a/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp deleted file mode 100644 index f4603b7ec37..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include "tt_metal/common/assert.hpp" -#include "tt_metal/test_utils/env_vars.hpp" - -class BasicFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - } - -}; - -class FDBasicFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with FD runtime"); - GTEST_SKIP(); - } - } - -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp deleted file mode 100644 index 7e638470b1c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -class DeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - - // Some CI machines have lots of cards, running all tests on all cards is slow - // Coverage for multidevices is decent if we just confirm 2 work - if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 2) { - num_devices_ = 2; - } - - std::vector ids; - for (unsigned int id = 0; id < num_devices_; id++) { - ids.push_back(id); - } - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices_ = tt::DevicePool::instance().get_all_active_devices(); - } - - void TearDown() override { - tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (unsigned int id = 0; id < devices_.size(); id++) { - if (devices_.at(id)->is_initialized()) { - tt::tt_metal::CloseDevice(devices_.at(id)); - } - } - } - - std::vector devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - - -class DeviceSingleCardFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - const chip_id_t mmio_device_id = 0; - 
reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); - device_ = reserved_devices_.at(mmio_device_id); - - - num_devices_ = reserved_devices_.size(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - tt::tt_metal::Device* device_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class BlackholeSingleCardFixture : public DeviceSingleCardFixture { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); - if (arch_ != tt::ARCH::BLACKHOLE) { - GTEST_SKIP(); - } - - const chip_id_t mmio_device_id = 0; - reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); - device_ = reserved_devices_.at(mmio_device_id); - - num_devices_ = reserved_devices_.size(); - } -}; - -class GalaxyFixture : public ::testing::Test { - protected: - void SkipTestSuiteIfNotGalaxyMotherboard() - { - const tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (!(arch == tt::ARCH::WORMHOLE_B0 && num_devices >= 32)) - { - GTEST_SKIP(); - } - } - - void InitializeDevices() - { - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - std::vector ids; - for (uint32_t id = 0; id < num_devices; id++) - { - ids.push_back(id); - } - this->device_ids_to_devices_ = tt::tt_metal::detail::CreateDevices(ids); - this->devices_ = tt::DevicePool::instance().get_all_active_devices(); - } - - void SetUp() override - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - this->InitializeDevices(); - } - - void TearDown() override - { - tt::tt_metal::detail::CloseDevices(this->device_ids_to_devices_); - this->device_ids_to_devices_.clear(); - 
this->devices_.clear(); - } - - std::vector devices_; - - private: - std::map device_ids_to_devices_; -}; - -class TGFixture : public GalaxyFixture -{ - protected: - void SkipTestSuiteIfNotTG() - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); - if (!(num_devices == 32 && num_pcie_devices == 4)) - { - GTEST_SKIP(); - } - } - - void SetUp() override - { - this->SkipTestSuiteIfNotTG(); - this->InitializeDevices(); - } -}; - -class TGGFixture : public GalaxyFixture -{ - protected: - void SkipTestSuiteIfNotTGG() - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); - if (!(num_devices == 64 && num_pcie_devices == 8)) - { - GTEST_SKIP(); - } - } - - void SetUp() override - { - this->SkipTestSuiteIfNotTGG(); - this->InitializeDevices(); - } -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp deleted file mode 100644 index 08e57a5cb2a..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -class N300DeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 2 and - tt::tt_metal::GetNumPCIeDevices() == 1) { - std::vector ids; - for (unsigned int id = 0; id < num_devices_; id++) { - ids.push_back(id); - } - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices_ = tt::DevicePool::instance().get_all_active_devices(); - - } else { - GTEST_SKIP(); - } - } - - void TearDown() override { - tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (unsigned int id = 0; id < devices_.size(); id++) { - tt::tt_metal::CloseDevice(devices_.at(id)); - } - } - - std::vector devices_; - tt::ARCH arch_; - size_t num_devices_; -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp deleted file mode 100644 index 4b35868852e..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp +++ /dev/null @@ -1,895 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "device_fixture.hpp" -#include "n300_device_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -} -} - -namespace unit_tests::erisc::kernels { - -const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { - TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); - size_t word_size = (max >> 5) - (base >> 5); - return (((rand() % word_size) << 5) + base); -} - -/* - * ███╗░░██╗░█████╗░░█████╗░ - * ████╗░██║██╔══██╗██╔══██╗ - * ██╔██╗██║██║░░██║██║░░╚═╝ - * ██║╚████║██║░░██║██║░░██╗ - * ██║░╚███║╚█████╔╝╚█████╔╝ - * ╚═╝░░╚══╝░╚════╝░░╚════╝░ - */ - -bool reader_kernel_no_send( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_reader_core, - const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM - }; - - 
auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - dram_noc_xy.str(), - dram_byte_address, - eth_reader_core.str(), - eth_l1_byte_address); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - eth_reader_core, - ethernet_config); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - eth_reader_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - tt_metal::detail::LaunchProgram(device, program); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; - } - return pass; -} - -bool writer_kernel_no_receive( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_writer_core, - const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = 
tt_metal::NOC::NOC_0}) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM - }; - - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - eth_writer_core.str(), - eth_l1_byte_address, - dram_noc_xy.str(), - dram_byte_address); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - eth_writer_core, - ethernet_config); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - eth_writer_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - tt_metal::detail::LaunchProgram(device, program); - - auto readback_vec = 
llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; - } - return pass; -} - -bool noc_reader_and_writer_kernels( - tt_metal::Device *device, - const uint32_t byte_size, - const uint32_t eth_dst_l1_address, - const uint32_t eth_src_l1_address, - const CoreCoord &logical_eth_core, - const tt_metal::EthernetConfig &reader_eth_config, - const tt_metal::EthernetConfig &writer_eth_config) { - bool pass = true; - - tt_metal::Program program = tt_metal::Program(); - - tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt_metal::BufferType::DRAM - }; - - auto reader_dram_buffer = CreateBuffer(dram_config); - auto writer_dram_buffer = CreateBuffer(dram_config); - - auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); - auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); - - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - reader_dram_noc_xy.str(), - reader_dram_buffer->address(), - logical_eth_core.str(), - eth_dst_l1_address); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - logical_eth_core.str(), - eth_src_l1_address, - writer_dram_noc_xy.str(), - writer_dram_buffer->address()); - - auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - logical_eth_core, - reader_eth_config); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - logical_eth_core, - { - (uint32_t)reader_dram_buffer->address(), - (uint32_t)reader_dram_noc_xy.x, - 
(uint32_t)reader_dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_dst_l1_address, - }); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - logical_eth_core, - writer_eth_config); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - logical_eth_core, - { - (uint32_t)writer_dram_buffer->address(), - (uint32_t)writer_dram_noc_xy.x, - (uint32_t)writer_dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_src_l1_address, - }); - - auto reader_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(reader_dram_buffer, reader_inputs); - - auto writer_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, writer_inputs, eth_src_l1_address); - - // Clear expected values at output locations - std::vector all_zeros(byte_size / sizeof(uint32_t), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_dst_l1_address); - tt_metal::detail::WriteToBuffer(writer_dram_buffer, all_zeros); - - tt_metal::detail::LaunchProgram(device, program); - - auto eth_readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_dst_l1_address, byte_size); - pass &= (eth_readback_vec == reader_inputs); - if (not pass) { - log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel read incorrect values from DRAM", logical_eth_core.str()); - } - std::vector dram_readback_vec; - tt_metal::detail::ReadFromBuffer(writer_dram_buffer, dram_readback_vec); - pass &= (dram_readback_vec == writer_inputs); - if (not pass) { - log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel wrote incorrect values to DRAM", logical_eth_core.str()); - } - - return pass; -} - -TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); 
- const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { - ASSERT_TRUE( - unit_tests::erisc::kernels::reader_kernel_no_send(device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - - for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { - ASSERT_TRUE( - unit_tests::erisc::kernels::reader_kernel_no_send(device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsNocWriteNoReceive) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - - for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - 
device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } -} - -/* - * - * ███████╗████████╗██╗░░██╗ - * ██╔════╝╚══██╔══╝██║░░██║ - * █████╗░░░░░██║░░░███████║ - * ██╔══╝░░░░░██║░░░██╔══██║ - * ███████╗░░░██║░░░██║░░██║ - * ╚══════╝░░░╚═╝░░░╚═╝░░╚═╝ - */ -bool eth_direct_sender_receiver_kernels( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const size_t& byte_size, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - uint32_t num_bytes_per_send = 16) { - bool pass = true; - log_debug( - tt::LogTest, - "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", - byte_size, - sender_device->id(), - eth_sender_core.str(), - src_eth_l1_byte_address, - receiver_device->id(), - eth_receiver_core.str(), - dst_eth_l1_byte_address); - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs, - src_eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - 
sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)src_eth_l1_byte_address, - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Execute Programs - //////////////////////////////////////////////////////////////////////////// - - std::thread th1 = std::thread([&] { - tt_metal::detail::LaunchProgram(sender_device, sender_program); - }); - std::thread th2 = std::thread([&] { - tt_metal::detail::LaunchProgram(receiver_device, receiver_program); - }); - - th1.join(); - th2.join(); - // tt_metal::ReadFromBuffer(l1_buffer, dest_core_data); - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - dst_eth_l1_byte_address, - byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; - std::cout << readback_vec[0] << std::endl; - } - return pass; 
-} - - - -} // namespace unit_tests::erisc::kernels - -TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = device_0->get_connected_ethernet_core(sender_core); - if (device_1->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - auto [device_id, 
receiver_core] = device_1->get_connected_ethernet_core(sender_core); - if (device_0->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} - -TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() == receiver_device->id()) { - continue; - } - for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); - if (receiver_device->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 4 * WORD_SIZE, - 
src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } - } - } -} - -TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * 256, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * 
256, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * 1024, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * 1024, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * MAX_NUM_WORDS, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * MAX_NUM_WORDS, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - for (int i = 0; i < 10; i++) { - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 
WORD_SIZE, - src_eth_l1_byte_address + WORD_SIZE * i, - dst_eth_l1_byte_address + WORD_SIZE * i, - sender_core, - receiver_core)); - } - for (int i = 0; i < 10; i++) { - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address + WORD_SIZE * i, - dst_eth_l1_byte_address + WORD_SIZE * i, - receiver_core, - sender_core)); - } - } -} - -TEST_F(N300DeviceFixture, EthKernelsRandomDirectSendTests) { - using namespace CMAKE_UNIQUE_NAMESPACE; - srand(0); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - std::map, std::tuple> connectivity = {}; - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); - connectivity.insert({{0, sender_core}, receiver_core}); - } - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); - connectivity.insert({{1, sender_core}, receiver_core}); - } - for (int i = 0; i < 1000; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - - const size_t src_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); - const size_t dst_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); - - int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - - std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / 
- WORD_SIZE; - int num_words = rand() % max_words + 1; - - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - send_chip, - receiver_chip, - WORD_SIZE * num_words, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} -TEST_F(N300DeviceFixture, EthKernelsRandomEthPacketSizeDirectSendTests) { - srand(0); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - std::map, std::tuple> connectivity = {}; - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); - connectivity.insert({{0, sender_core}, receiver_core}); - } - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); - connectivity.insert({{1, sender_core}, receiver_core}); - } - std::vector num_bytes_per_send_test_vals = { - 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; - for (const auto& num_bytes_per_send : num_bytes_per_send_test_vals) { - log_info(tt::LogTest, "Random eth send tests with {} bytes per packet", num_bytes_per_send); - for (int i = 0; i < 10; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - - const size_t src_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); - const size_t dst_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - 
eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); - - int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - - std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / - num_bytes_per_send; - int num_words = rand() % max_words + 1; - - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - send_chip, - receiver_chip, - num_bytes_per_send * num_words, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core, - num_bytes_per_send)); - } - } -} - -// TODO #14640: Run this on WH when i$ flush issue is addressed -TEST_F(BlackholeSingleCardFixture, EthKernelOnIdleErisc0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - } -} - -TEST_F(BlackholeSingleCardFixture, EthKernelOnIdleErisc1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - 
tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - } -} - -TEST_F(BlackholeSingleCardFixture, EthKernelOnBothIdleEriscs) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t read_write_size_bytes = WORD_SIZE * 2048; - uint32_t reader_dst_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - uint32_t writer_src_address = reader_dst_address + read_write_size_bytes; - tt_metal::EthernetConfig erisc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - tt_metal::EthernetConfig erisc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( - device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config - )); - erisc0_ethernet_config.noc = tt_metal::NOC::NOC_1; - 
erisc1_ethernet_config.noc = tt_metal::NOC::NOC_1; - ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( - device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config - )); - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp deleted file mode 100644 index 01f63153840..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "n300_device_fixture.hpp" -#include "tt_metal/common/logger.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; - -struct erisc_info_t { - volatile uint32_t num_bytes; - volatile uint32_t mode; - volatile uint32_t reserved_0_; - volatile uint32_t reserved_1_; - volatile uint32_t bytes_done; - volatile uint32_t reserverd_2_; - volatile uint32_t reserverd_3_; - volatile uint32_t reserverd_4_; -}; -} -} - -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace unit_tests::erisc::direct_send { -// Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE -bool send_over_eth( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& sender_core, - const CoreCoord& receiver_core, - const size_t& byte_size) { - tt::log_debug( - 
tt::LogTest, - "Running direct send test with sender chip {} core {}, receiver chip {} core {}, sending {} bytes", - sender_device->id(), - sender_core.str(), - receiver_device->id(), - receiver_core.str(), - byte_size); - std::vector eth_cores = { - CoreCoord(9, 0), - CoreCoord(1, 0), - CoreCoord(8, 0), - CoreCoord(2, 0), - CoreCoord(9, 6), - CoreCoord(1, 6), - CoreCoord(8, 6), - CoreCoord(2, 6), - CoreCoord(7, 0), - CoreCoord(3, 0), - CoreCoord(6, 0), - CoreCoord(4, 0), - CoreCoord(7, 6), - CoreCoord(3, 6), - CoreCoord(6, 6), - CoreCoord(4, 6)}; - - // Disable all eth core runtime app flags, zero out data write counter - std::vector run_test_app_flag = {0x0}; - for (const auto& eth_core : eth_cores) { - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - std::vector zero = {0, 0, 0, 0, 0, 0, 0, 0}; - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - } - - // TODO: is it possible that receiver core app is stil running when we push inputs here??? 
- auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), sender_core, inputs, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - - // Zero out receiving address to ensure no stale data is causing tests to pass - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), receiver_core, all_zeros, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - - std::vector args_0 = {uint32_t(byte_size), 0}; - llrt::write_hex_vec_to_core(sender_device->id(), sender_core, args_0, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - std::vector args_1 = {uint32_t(byte_size), 1}; - llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - - // TODO: this should be updated to use kernel api - uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - ll_api::memory binary_mem_send = llrt::get_risc_binary( - sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); - ll_api::memory binary_mem_receive = llrt::get_risc_binary( - receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); - - for (const auto& eth_core : eth_cores) { - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, binary_mem_send.data(), eth_l1_mem::address_map::FIRMWARE_BASE); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, binary_mem_receive.data(), eth_l1_mem::address_map::FIRMWARE_BASE); - } - - // Activate sender core runtime app - run_test_app_flag = {0x1}; - // send remote first, otherwise eth core may be blocked, very ugly for now... 
- if (receiver_device->id() == 1) { - llrt::write_hex_vec_to_core( - 1, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } else { - llrt::write_hex_vec_to_core(1, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } - if (sender_device->id() == 0) { - llrt::write_hex_vec_to_core(0, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } else { - llrt::write_hex_vec_to_core( - 0, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } - - bool pass = true; - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), receiver_core, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, byte_size); - pass &= (readback_vec == inputs); - - return pass; -} - -} // namespace unit_tests::erisc::direct_send - -TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 6); - CoreCoord sender_core_1 = CoreCoord(1, 6); - - CoreCoord receiver_core_0 = CoreCoord(9, 0); - CoreCoord receiver_core_1 = CoreCoord(1, 0); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 0); - CoreCoord sender_core_1 = CoreCoord(1, 0); - - CoreCoord receiver_core_0 = CoreCoord(9, 6); - CoreCoord receiver_core_1 = CoreCoord(1, 6); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, 
BidirectionalEthCoreDirectSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 6); - CoreCoord sender_core_1 = CoreCoord(1, 6); - - CoreCoord receiver_core_0 = CoreCoord(9, 0); - CoreCoord receiver_core_1 = CoreCoord(1, 0); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 1024)); - 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, RandomDirectSendTests) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - srand(0); - ASSERT_TRUE(this->num_devices_ == 2); - - std::map, std::pair> connectivity = { - {{0, CoreCoord(9, 6)}, {1, CoreCoord(9, 0)}}, - {{1, CoreCoord(9, 0)}, {0, CoreCoord(9, 6)}}, - {{0, CoreCoord(1, 6)}, {1, CoreCoord(1, 0)}}, - {{1, CoreCoord(1, 0)}, {0, CoreCoord(1, 6)}}}; - for (int i = 0; i < 1000; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - int num_words = 0; - if constexpr (MAX_NUM_WORDS != 0) { - num_words = rand() % MAX_NUM_WORDS + 1; - } - - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - send_chip, receiver_chip, sender_core, receiver_core, WORD_SIZE * num_words)); - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp deleted file mode 100644 index f70039820a5..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "gtest/gtest.h" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" -#include "tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/common/math.hpp" - -using std::vector; -using namespace tt::tt_metal; - -// TODO: Remove dependency on "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" and remove globals -bool debug_g = false; -// Page size 4096 bytes -uint32_t log_dispatch_buffer_page_size_g = 12; -uint32_t dispatch_buffer_page_size_g = 1 << log_dispatch_buffer_page_size_g; -bool use_coherent_data_g = false; -uint32_t hugepage_buffer_size_g = 256 * 1024 * 1024; -uint32_t dev_hugepage_base = dispatch_buffer_page_size_g; -std::pair default_ptrs = std::make_pair(dev_hugepage_base, 0); -uint32_t hugepage_issue_buffer_size_g; - -inline void gen_dispatcher_pad_to_page(vector& cmds, uint32_t page_size) { - uint32_t num_words_in_page = page_size / sizeof(uint32_t); - uint32_t num_pad_words = tt::round_up(cmds.size(), num_words_in_page) - cmds.size(); - for (uint32_t i = 0; i < num_pad_words; ++i) { - cmds.push_back(0); - } -} - -inline bool validate_results( - std::vector& dev_data, - uint32_t num_words, - void *host_hugepage_base, - uint32_t dev_hugepage_base, - uint32_t dev_hugepage_start, - uint32_t hugepage_buffer_size_g) { - bool failed = false; - - log_info(tt::LogTest, "Validating {} bytes from hugepage", num_words * sizeof(uint32_t)); - - uint32_t *results = ((uint32_t *)host_hugepage_base); // 8 = 32B / sizeof(uint32_t) - uint32_t dev_hugepage_start_diff_uint = (dev_hugepage_start - dev_hugepage_base) / sizeof(uint32_t); - uint32_t hugepage_buffer_size_g_uint = hugepage_buffer_size_g / sizeof(uint32_t); - int fail_count = 0; - - for (int i = 0; i < num_words; ++i) { - uint32_t hugepage_idx = 
(dev_hugepage_start_diff_uint + i) % hugepage_buffer_size_g_uint; - if (results[hugepage_idx] != dev_data[i]) { - if (!failed) { - tt::log_fatal("Data mismatch"); - fprintf(stderr, "First 20 failures for each core: [idx] expected->read\n"); - } - if (fail_count == 0) { - fprintf(stderr, "Failures reading hugepage\n"); - } - - fprintf(stderr, " [%02d] 0x%08x->0x%08x\n", i, (unsigned int)dev_data[i], (unsigned int)results[hugepage_idx]); - - failed = true; - fail_count++; - if (fail_count > 20) { - break; - } - } - } - - return !failed; -} - -namespace local_test_functions { - -bool test_write_host(Device *device, uint32_t data_size, std::pair write_ptr_start = default_ptrs, std::pair read_ptr_start = default_ptrs, std::optional> read_ptr_update = std::nullopt) { - CoreCoord spoof_prefetch_core = {0, 0}; - CoreCoord dispatch_core = {4, 0}; - CoreCoord phys_spoof_prefetch_core = device->worker_core_from_logical_core(spoof_prefetch_core); - CoreCoord phys_dispatch_core = device->worker_core_from_logical_core(dispatch_core); - - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - uint32_t dispatch_buffer_size_blocks_g = 4; - - uint32_t total_size = data_size + sizeof(CQDispatchCmd); - - // NOTE: this test hijacks hugepage - // Start after first page since ptrs are at the start of hugepage - - void *host_hugepage_base; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - host_hugepage_base = (void *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); - host_hugepage_base = (void *)((uint8_t *)host_hugepage_base + dev_hugepage_base); - - uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); - uint32_t l1_buf_base = align(l1_unreserved_base, dispatch_buffer_page_size_g); - - std::vector dispatch_cmds; - CQDispatchCmd cmd; - memset(&cmd, 0, sizeof(CQDispatchCmd)); - 
cmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_LINEAR_H_HOST; - cmd.write_linear_host.length = data_size + sizeof(CQDispatchCmd); - add_dispatcher_cmd(dispatch_cmds, cmd, data_size); - gen_dispatcher_pad_to_page(dispatch_cmds, dispatch_buffer_page_size_g); - uint32_t dev_output_num_words = total_size / sizeof(uint32_t); - gen_dispatcher_terminate_cmd(dispatch_cmds); - - uint32_t cmd_cb_pages = tt::div_up(dispatch_cmds.size() * sizeof(uint32_t), dispatch_buffer_page_size_g); - - // Make full blocks - uint32_t dispatch_buffer_pages = tt::round_up(cmd_cb_pages, dispatch_buffer_size_blocks_g); - uint32_t dispatch_buffer_size_g = dispatch_buffer_pages * dispatch_buffer_page_size_g; - TT_FATAL(l1_buf_base + dispatch_buffer_size_g <= device->l1_size_per_core(), "Does not fit in L1"); - - std::vector write_ptr_val = {(write_ptr_start.first >> 4) | (write_ptr_start.second << 31)}; - std::vector read_ptr_val = {(read_ptr_start.first >> 4) | (read_ptr_start.second << 31)}; - - uint32_t completion_q_wr_ptr = dispatch_constants::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); - uint32_t completion_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - // Write the read and write ptrs - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, write_ptr_val, completion_q_wr_ptr); - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, read_ptr_val, completion_q_rd_ptr); - - tt::llrt::write_hex_vec_to_core(device->id(), phys_spoof_prefetch_core, dispatch_cmds, l1_buf_base); - tt::Cluster::instance().l1_barrier(device->id()); - - const uint32_t spoof_prefetch_core_sem_0_id = - tt::tt_metal::CreateSemaphore(program, {spoof_prefetch_core}, dispatch_buffer_pages); - const uint32_t dispatch_core_sem_id = tt::tt_metal::CreateSemaphore(program, {dispatch_core}, 0); - TT_ASSERT(spoof_prefetch_core_sem_0_id == dispatch_core_sem_id); - const 
uint32_t dispatch_cb_sem = spoof_prefetch_core_sem_0_id; - - const uint32_t spoof_prefetch_core_sem_1_id = tt::tt_metal::CreateSemaphore(program, {spoof_prefetch_core}, 0); - const uint32_t prefetch_sync_sem = spoof_prefetch_core_sem_1_id; - - std::vector dispatch_compile_args = { - l1_buf_base, - log_dispatch_buffer_page_size_g, - dispatch_buffer_pages, - dispatch_cb_sem, - dispatch_cb_sem, // ugly, share an address - dispatch_buffer_size_blocks_g, - prefetch_sync_sem, - default_ptrs.second, - dev_hugepage_base, - hugepage_buffer_size_g, - 0, // unused downstream_cb_base - 0, // unused downstream_cb_size - 0, // unused my_downstream_cb_sem_id - 0, // unused downstream_cb_sem_id - 0, // unused split_dispatch_page_preamble_size - true, - true}; - std::vector spoof_prefetch_compile_args = { - l1_buf_base, - log_dispatch_buffer_page_size_g, - dispatch_buffer_pages, - dispatch_cb_sem, - l1_buf_base, - cmd_cb_pages, - // Hardcode page_batch_size to 1 to force the inner loops to only run once - 1, - prefetch_sync_sem, - }; - - std::map prefetch_defines = { - {"MY_NOC_X", std::to_string(phys_spoof_prefetch_core.x)}, - {"MY_NOC_Y", std::to_string(phys_spoof_prefetch_core.y)}, - {"DISPATCH_NOC_X", std::to_string(phys_dispatch_core.x)}, - {"DISPATCH_NOC_Y", std::to_string(phys_dispatch_core.y)}, - }; - - auto sp1 = tt::tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/spoof_prefetch.cpp", - {spoof_prefetch_core}, - tt::tt_metal::DataMovementConfig{ - .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC::RISCV_0_default, - .compile_args = spoof_prefetch_compile_args, - .defines = prefetch_defines}); - - // Hardcode outer loop to 1 - vector args = {1}; - tt::tt_metal::SetRuntimeArgs(program, sp1, spoof_prefetch_core, args); - - constexpr NOC my_noc_index = NOC::NOC_0; - constexpr NOC dispatch_upstream_noc_index = NOC::NOC_1; - - configure_kernel_variant(program, - 
"tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_compile_args, - dispatch_core, - phys_dispatch_core, - phys_spoof_prefetch_core, - {0, 0}, - device, - my_noc_index, - my_noc_index, - my_noc_index); - - // Need a separate thread for SD - if (read_ptr_update.has_value()) { - std::thread t1 ([&]() { - uint64_t run_mailbox_address = GET_MAILBOX_ADDRESS_HOST(launch.run); - std::vector run_mailbox_read_val; - uint8_t run; - do { - run_mailbox_read_val = tt::llrt::read_hex_vec_from_core(device->id(), phys_dispatch_core, run_mailbox_address & ~0x3, sizeof(uint32_t)); - run = run_mailbox_read_val[0] >> (8 * (offsetof(launch_msg_t, run) & 3)); - } while (run != RUN_MSG_GO); - sleep(1); - std::vector read_ptr_update_val = {(read_ptr_update.value().first >> 4) | (read_ptr_update.value().second << 31)}; - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, read_ptr_update_val, completion_q_rd_ptr); - }); - tt::tt_metal::detail::LaunchProgram(device, program); - t1.join(); - } else { - tt::tt_metal::detail::LaunchProgram(device, program); - } - - // Validation - bool pass = validate_results( - dispatch_cmds, dev_output_num_words, host_hugepage_base, dev_hugepage_base, write_ptr_start.first, hugepage_buffer_size_g); - return pass; -} - -} // end namespace local_test_functions - -namespace basic_tests { - -TEST_F(DeviceSingleCardFixture, TestWriteHostBasic) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, dispatch_buffer_page_size_g - sizeof(CQDispatchCmd))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, dispatch_buffer_page_size_g)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 256)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 3 * dispatch_buffer_page_size_g)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g)); -} - -TEST_F(DeviceSingleCardFixture, TestWriteHostWrap) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * 
dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 1 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 1 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 2 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 2 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 3 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 3 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); -} - -TEST_F(DeviceSingleCardFixture, TestWriteHostStall) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base, 1}, {dev_hugepage_base, 0}, std::make_pair(dev_hugepage_base + 11 * dispatch_buffer_page_size_g, 0))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base, 1}, {dev_hugepage_base + 5 * dispatch_buffer_page_size_g, 0}, std::make_pair(dev_hugepage_base + 11 * dispatch_buffer_page_size_g, 0))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 1}, {dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 0}, std::make_pair(dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 1))); -} - -} // namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp deleted file mode 100644 index 68dc974545a..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/common/logger.hpp" - - -using namespace tt; - -// Ensures we can successfully create kernels on available compute grid -TEST_F(CommonFixture, CreateKernelsOnComputeCores) { - for (unsigned int id = 0; id < devices_.size(); id++) { - tt_metal::Program program = CreateProgram(); - CoreCoord compute_grid = devices_.at(id)->compute_with_storage_grid_size(); - EXPECT_NO_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - CoreRange(CoreCoord(0, 0), CoreCoord(compute_grid.x, compute_grid.y)), - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } -} - -// Ensure we cannot create kernels on storage cores -TEST_F(CommonFixture, CreateKernelsOnStorageCores) { - for (unsigned int id=0; id < devices_.size(); id++) { - if (devices_.at(id)->storage_only_cores().empty()) { - GTEST_SKIP() << "This test only runs on devices with storage only cores"; - } - CoreRangeSet storage_core_range_set = CoreRangeSet(devices_.at(id)->storage_only_cores()); - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - 
"tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - storage_core_range_set, - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } -} - -TEST_F(CommonFixture, CreateKernelsOnDispatchCores) { - if (getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - GTEST_SKIP() << "This test is only supported in fast dispatch mode"; - } - for (unsigned int id=0; id < devices_.size(); id++) { - std::vector dispatch_cores = tt::get_logical_dispatch_cores(device->id(), device->num_hw_cqs()); - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - std::set dispatch_core_range_set(dispatch_cores.begin(), dispatch_cores.end()); - - if (dispatch_core_type == CoreType::WORKER) { - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - dispatch_core_range_set, - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } else if (dispatch_core_type == CoreType::ETH) { - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/misc/erisc_print.cpp", - dispatch_core_range_set, - {.noc = tt_metal::NOC::NOC_0, .eth_mode = Eth::IDLE} - ); - ); - } - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp deleted file mode 100644 index 829a9feb140..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "common_fixture.hpp" -#include "impl/debug/dprint_server.hpp" -#include "tt_metal/common/core_descriptor.hpp" - -// A version of CommonFixture with DPrint enabled on all cores. 
-class DPrintFixture: public CommonFixture { -public: - inline static const string dprint_file_name = "gtest_dprint_log.txt"; - - // A function to run a program, according to which dispatch mode is set. - void RunProgram(Device* device, Program& program) { - // Only difference is that we need to wait for the print server to catch - // up after running a test. - CommonFixture::RunProgram(device, program); - tt::DprintServerAwait(); - } - -protected: - // Running with dprint + watcher enabled can make the code size blow up, so let's force watcher - // disabled for DPRINT tests. - bool watcher_previous_enabled; - void SetUp() override { - // The core range (physical) needs to be set >= the set of all cores - // used by all tests using this fixture, so set dprint enabled for - // all cores and all devices - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); - // Send output to a file so the test can check after program is run. - tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); - tt::llrt::OptionsG.set_test_mode_enabled(true); - watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); - tt::llrt::OptionsG.set_watcher_enabled(false); - - ExtraSetUp(); - - // Parent class initializes devices and any necessary flags - CommonFixture::SetUp(); - } - - void TearDown() override { - // Parent class tears down devices - CommonFixture::TearDown(); - - // Remove the DPrint output file after the test is finished. 
- std::remove(dprint_file_name.c_str()); - - // Reset DPrint settings - tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, ""); - tt::llrt::OptionsG.set_test_mode_enabled(false); - tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); - } - - void RunTestOnDevice( - const std::function& run_function, - Device* device - ) { - auto run_function_no_args = [=]() { - run_function(this, device); - }; - CommonFixture::RunTestOnDevice(run_function_no_args, device); - tt::DPrintServerClearLogFile(); - tt::DPrintServerClearSignals(); - } - - // Override this function in child classes for additional setup commands between DPRINT setup - // and device creation. - virtual void ExtraSetUp() {} -}; - -// For usage by tests that need the dprint server devices disabled. 
-class DPrintFixtureDisableDevices: public DPrintFixture { -protected: - void ExtraSetUp() override { - // For this test, mute each devices using the environment variable - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); - } -}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt deleted file mode 100644 index 75cc62aeabb..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ -set(UNIT_TESTS_FD_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_CommandQueue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_events.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_HostAsyncCQ.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_worker_config_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/sfpu/sfpu_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_device_pool.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_eth_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_eth_ring_gather_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/pipelining/basic_pipeline.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/streams/test_autonomous_relay_streams.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sub_device/test_sub_device.cpp -) - -add_executable( - unit_tests_fast_dispatch - ${UNIT_TESTS_FD_SRC} - $ -) -TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch) - -target_link_libraries(unit_tests_fast_dispatch PUBLIC test_metal_common_libs) -target_include_directories( - unit_tests_fast_dispatch - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - 
${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_fast_dispatch - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md deleted file mode 100644 index 55aab607296..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Summary -Unit testing uses the doctest framework. See https://github.com/doctest/doctest/ -Generally, there are three main levels of organization: -* TEST_SUITE - Used to group main areas of tests -* TEST_CASE - How Test case and sub-case gets split up is at test-writer discretion, but see the test_case section -* SUB_CASE - - -## Build && Execution -### Build -`make tests/tt_metal/unit_tests` -### Get Help -`./build/test/tt_metal/unit_tests --help` -### Execute all tests -`./build/test/tt_metal/unit_tests` -### Execute filtered test-suite -`./build/test/tt_metal/unit_tests -ts="*Sfpu*"` -### List all test-suite with filter -`./build/test/tt_metal/unit_tests -ts="*Sfpu*" -lts` - -## Folder Structure -General structure of the tests are as follows, more sub-folders can be added -
-Directory Structure - Please add any new-tests to a corresponding folder. -
-
-tt_metal/unit_tests/
-  > test_main.cpp
-  > basic/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > common/
-    > # Used to hold any common structures across all test suites like fixtures
-  > dram/
-    > # Any dram unit/stress test files can exist here, will be automatically added to test_main
-  > compute/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > new_folders/
-    > # Any test files can exist here, will be automatically added to test_main
-test_utils/
-  > comparison.cpp # Useful utils for comparing, see example usages in unit tests
-  > print_helpers.cpp # Useful utils for printin
-  > stimulus.cpp # Useful utils for generating random vectors or specific vectors, see example usages in unit tests
-  > tilization.cpp # Useful utils for converting between tiled vectors or not, see example usages in unit tests
-
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp deleted file mode 100644 index 7aa1811ecd5..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" - -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - tt::tt_metal::BufferType buftype; -}; - -inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { - // This function just enqueues a buffer (which should be large in the config) - // write as a precursor to testing the wrap mechanism - size_t buf_size = config.num_pages * config.page_size; - auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); - - std::vector src = create_random_vector_of_bfloat16( - buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - - EnqueueWriteBuffer(cq, *buffer, src, false); - return std::make_pair(std::move(buffer), src); -} - -inline bool does_device_have_active_eth_cores(const Device *device) { - return !(device->get_active_ethernet_cores(true).empty()); -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp deleted file mode 100644 index 8017f70fb27..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/stimulus.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" - -using namespace tt::tt_metal; - -namespace host_tests { - -namespace multi_device_tests { -TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestAccessCommandQueue) { - for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { - EXPECT_NO_THROW(devices_[device_id]->command_queue()); - } -} - -TEST(FastDispatchHostSuite, TestCannotAccessCommandQueueForClosedDevice) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - const unsigned int device_id = 0; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - Device* device = tt::tt_metal::CreateDevice(device_id, tt::llrt::OptionsG.get_num_hw_cqs(), DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - - EXPECT_NO_THROW(device->command_queue()); - CloseDevice(device); - EXPECT_ANY_THROW(device->command_queue()); -} - -TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestDirectedLoopbackToUniqueHugepage) { - std::unordered_map> golden_data; - - const uint32_t byte_size = 2048 * 16; - const uint64_t address = 0; - - for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { - std::vector data = - tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); - - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - tt::Cluster::instance().write_sysmem(data.data(), 
data.size() * sizeof(uint32_t), address, mmio_device_id, channel); - - golden_data[device_id] = data; - } - - std::vector readback_data; - readback_data.resize(byte_size / sizeof(uint32_t)); - for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - tt::Cluster::instance().read_sysmem(readback_data.data(), byte_size, address, mmio_device_id, channel); - EXPECT_EQ(readback_data, golden_data.at(device_id)); - } -} -} - - - - -} // namespace host_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp deleted file mode 100644 index e4eceaffb9c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ /dev/null @@ -1,348 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "impl/buffers/buffer.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/util.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/impl/buffers/circular_buffer.hpp" - -using std::vector; -using namespace tt::tt_metal; - -namespace host_cq_test_utils { -// Utility functions for Async Queue Flatten stress test -// Add more utils here for testing other ops/workloads -uint32_t prod(vector &shape) { - uint32_t shape_prod = 1; - - for (uint32_t shape_i: shape) { - shape_prod *= shape_i; - } - - return shape_prod; -} - -inline std::vector gold_standard_flatten(std::vector src_vec, vector shape) { - - int numel_in_tensor = prod(shape) / 2; - int idx = 0; - std::vector expected_dst_vec; - - uint32_t num_tile_rows = shape.at(shape.size() - 2) / 32; - uint32_t num_tile_cols = shape.at(shape.size() - 1) / 32; - - uint32_t start_dram_addr_offset_for_tensor_row = 0; - - for (int i = 0; i < num_tile_rows; i++) { - for (uint32_t j = 0; j < 32; j++) { - uint32_t src_addr_ = start_dram_addr_offset_for_tensor_row; - for (uint32_t k = 0; k < num_tile_cols; k++) { - - // Copy a row - for (uint32_t l = 0; l < 16; l++) { - uint32_t src_addr = src_addr_ + l; - expected_dst_vec.push_back(src_vec.at(src_addr_ + l)); - } - - // Zero padding - for (uint32_t l = 0; l < 31 * 16; l++) { - expected_dst_vec.push_back(0); - } - src_addr_ += 32 * 16; - } - start_dram_addr_offset_for_tensor_row += 16; - } - start_dram_addr_offset_for_tensor_row += num_tile_cols * 16; - } - - TT_FATAL(expected_dst_vec.size() == (num_tile_rows * 32) * (num_tile_cols * 16) * 32, "Error"); - return expected_dst_vec; -} - -bool flatten(Device 
*device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { - // Test Simulating Program Caching with Async Command Queues - bool pass = true; - // Create a program used across all loops - Program program = CreateProgram(); - - CoreCoord core = {0, 0}; - - uint32_t single_tile_size = 2 * 1024; - - uint32_t num_tiles = num_tiles_r * num_tiles_c; - uint32_t num_bytes_per_tensor_row = num_tiles_c * 64; - uint32_t num_bytes_per_tile = num_tiles * single_tile_size; - - uint32_t dram_buffer_size = single_tile_size * num_tiles * 32; - - - InterleavedBufferConfig dram_config{ - .device=device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = BufferType::DRAM - }; - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = 8; - CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); - - uint32_t ouput_cb_index = 16; - uint32_t num_output_tiles = 1; - CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(ouput_cb_index, single_tile_size); - auto cb_output = CreateCircularBuffer(program, core, cb_output_config); - - auto flatten_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - - auto unary_writer_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/writer_unary.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - vector compute_kernel_args = { - num_tiles * 32 - }; - - auto eltwise_unary_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", - core, - 
ComputeConfig{.compile_args = compute_kernel_args} - ); - - // Inside the loop, run async runtime functions - for (int i = 0; i < 1000; i++) { - // Create Device Buffers Asynchronously - auto src_dram_buffer = CreateBuffer(dram_config); - auto dst_dram_buffer = CreateBuffer(dram_config); - - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // Create the source vector - std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( - dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count())); - - std::vector golden = gold_standard_flatten(*src_vec, {num_tiles_r * 32, num_tiles_c * 32}); - // Set the runtime args asynchronously - std::shared_ptr writer_runtime_args = std::make_shared(); - std::shared_ptr compute_runtime_args = std::make_shared(); - *compute_runtime_args = { - src_dram_buffer.get(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row - }; - *writer_runtime_args = { - dst_dram_buffer.get(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles * 32 - }; - - SetRuntimeArgs( - device, - detail::GetKernel(program, flatten_kernel), - core, - compute_runtime_args); - - SetRuntimeArgs( - device, - detail::GetKernel(program, unary_writer_kernel), - core, - writer_runtime_args); - // Async write input - EnqueueWriteBuffer(device->command_queue(), src_dram_buffer, src_vec, false); - // Share ownership of buffer with program - AssignGlobalBufferToProgram(src_dram_buffer, program); - // Main thread gives up ownership of buffer and src data (this is what python does) - src_dram_buffer.reset(); - src_vec.reset(); - // Queue up program - EnqueueProgram(device->command_queue(), program, false); - // Blocking read - std::vector result_vec; - EnqueueReadBuffer(device->command_queue(), dst_dram_buffer, result_vec, true); - - // Validation of data - 
TT_FATAL(golden.size() == result_vec.size(), "Error"); - pass &= (golden == result_vec); - - if (not pass) { - std::cout << "GOLDEN" << std::endl; - print_vec_of_uint32_as_packed_bfloat16(golden, num_tiles * 32); - - std::cout << "RESULT" << std::endl; - print_vec_of_uint32_as_packed_bfloat16(result_vec, num_tiles * 32); - } - } - return pass; -} -} - -namespace host_command_queue_tests { - -TEST_F(CommandQueueFixture, TestAsyncCommandQueueSanityAndProfile) { - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - - CoreRange cr({0, 0}, {0, 0}); - CoreRangeSet cr_set({cr}); - // Add an NCRISC blank manually, but in compile program, the BRISC blank will be - // added separately - auto dummy_reader_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - // Use scoper timer to benchmark time for pushing 2 commands - { - tt::ScopedTimer timer("AsyncCommandQueue"); - EnqueueProgram(command_queue, program, false); - Finish(command_queue); - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { - // Test Async Enqueue Read and Write + Get Addr + Buffer Allocation and Deallocation - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - for (int j = 0; j < 10; j++) { - // Asynchronously initialize a buffer on device - uint32_t first_buf_value = j + 1; - uint32_t second_buf_value = j + 2; - uint32_t first_buf_size = 4096; - uint32_t second_buf_size = 2048; - // Asynchronously allocate buffer on device - std::shared_ptr buffer = Buffer::create(this->device_, first_buf_size, 
first_buf_size, BufferType::DRAM); - std::shared_ptr allocated_buffer_address = std::make_shared(); - EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); - // Ensure returned addr is correct - EXPECT_EQ((*allocated_buffer_address), buffer->address()); - - std::shared_ptr> vec = std::make_shared>(first_buf_size / 4, first_buf_value); - std::vector readback_vec = {}; - // Write first vector to existing on device buffer. - EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); - // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access to this data) - vec = std::make_shared>(second_buf_size / 4, second_buf_value); - // Simulate what tt-eager does: Share buffer ownership with program - AssignGlobalBufferToProgram(buffer, program); - // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be deallocated) - buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); - // Write second vector to second buffer - EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); - // Have main thread give up ownership immediately after writing - vec.reset(); - // Read both buffer and ensure data is correct - EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); - for (int i = 0; i < readback_vec.size(); i++) { - EXPECT_EQ(readback_vec[i], second_buf_value); - } - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncCBAllocation) { - // Test asynchronous allocation of buffers and their assignment to CBs - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - - const uint32_t num_pages = 1; - const uint32_t page_size = detail::TileSize(tt::DataFormat::Float16_b); - const 
tt::DataFormat data_format = tt::DataFormat::Float16_b; - - auto buffer_size = page_size; - tt::tt_metal::InterleavedBufferConfig buff_config{ - .device=this->device_, - .size = buffer_size, - .page_size = buffer_size, - .buffer_type = tt::tt_metal::BufferType::L1 - }; - // Asynchronously allocate an L1 Buffer - auto l1_buffer = CreateBuffer(buff_config); - CoreRange cr({0, 0}, {0, 2}); - CoreRangeSet cr_set({cr}); - std::vector buffer_indices = {16, 24}; - - CircularBufferConfig config1 = CircularBufferConfig(page_size, {{buffer_indices[0], data_format}, {buffer_indices[1], data_format}}, *l1_buffer) - .set_page_size(buffer_indices[0], page_size) - .set_page_size(buffer_indices[1], page_size); - // Asynchronously assign the L1 Buffer to the CB - auto multi_core_cb = CreateCircularBuffer(program, cr_set, config1); - auto cb_ptr = detail::GetCircularBuffer(program, multi_core_cb); - Finish(this->device_->command_queue()); - // Addresses should match - EXPECT_EQ(cb_ptr->address(), l1_buffer->address()); - // Asynchronously allocate a new L1 buffer - auto l1_buffer_2 = CreateBuffer(buff_config); - // Asynchronously update CB address to match new L1 buffer - UpdateDynamicCircularBufferAddress(program, multi_core_cb, *l1_buffer_2); - Finish(this->device_->command_queue()); - // Addresses should match - EXPECT_EQ(cb_ptr->address(), l1_buffer_2->address()); - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncAssertForDeprecatedAPI) { - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - CoreCoord core = {0, 0}; - uint32_t buf_size = 4096; - uint32_t page_size = 4096; - auto dummy_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto src0 = 
Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); - std::vector runtime_args = {src0->address()}; - try { - SetRuntimeArgs(program, dummy_kernel, core, runtime_args); - } - catch (std::runtime_error &e) { - std::string expected = "This variant of SetRuntimeArgs can only be called when Asynchronous SW Command Queues are disabled for Fast Dispatch."; - const string error = string(e.what()); - EXPECT_TRUE(error.find(expected) != std::string::npos); - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncFlattenStress){ - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - uint32_t num_tiles_r = 2; - uint32_t num_tiles_c = 2; - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - num_tiles_r = 1; - num_tiles_c = 1; - } - ASSERT_TRUE(host_cq_test_utils::flatten(this->device_, num_tiles_r, num_tiles_c)); - command_queue.set_mode(current_mode); -} -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp deleted file mode 100644 index 85af4f9396f..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -using namespace tt; -using namespace tt::test_utils; - -TEST_F(FDBasicFixture, DevicePoolOpenClose) { - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get devices again - for (const auto& dev: devices) { - dev->close(); - } - devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolReconfigDevices) { - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - 
ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get devices with different configs - for (const auto& dev: devices) { - dev->close(); - } - l1_small_size = 2048; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolAddDevices) { - if (tt::tt_metal::GetNumAvailableDevices() != 8) { - GTEST_SKIP(); - } - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get more devices - for (const auto& dev: devices) { - dev->close(); - } - device_ids = {0, 1, 2, 3}; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices = tt::DevicePool::instance().get_all_active_devices(); - ASSERT_TRUE(devices.size() >= 4); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolReduceDevices) { - if (tt::tt_metal::GetNumAvailableDevices() != 8) { - GTEST_SKIP(); - } - std::vector device_ids{0, 1, 2, 3}; - int num_hw_cqs = 1; - 
int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - const auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get less devices - for (const auto& dev: devices) { - dev->close(); - } - device_ids = {0}; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto dev = tt::DevicePool::instance().get_active_device(0); - ASSERT_TRUE(dev->id() == 0); - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - tt::DevicePool::instance().close_device(0); -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp deleted file mode 100644 index ef05c731489..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp +++ /dev/null @@ -1,735 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" -#include "tt_metal/impl/device/device.hpp" - -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - - -using std::vector; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -constexpr std::int32_t MAX_BUFFER_SIZE = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - -struct BankedConfig { - size_t num_pages = 1; - size_t size_bytes = 1 * 2 * 32 * 32; - size_t page_size_bytes = 2 * 32 * 32; - tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; - tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; - tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; -}; -} -} - -namespace fd_unit_tests::erisc::kernels { - -const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { - TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); - size_t word_size = (max >> 5) - (base >> 5); - return (((rand() % word_size) << 5) + base); -} - -bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, const CoreCoord& eth_core_coord) { - Program program; - bool pass = true; - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); - - constexpr uint32_t num_runtime_args0 = 9; - constexpr uint32_t rta_base0 = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - std::map dummy_defines0 = {{"DATA_MOVEMENT", "1"}, - {"NUM_RUNTIME_ARGS", std::to_string(num_runtime_args0)}, - {"RESULTS_ADDR", std::to_string(rta_base0)}}; - auto dummy_kernel0 = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/misc/runtime_args_kernel.cpp", - eth_core_coord, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .defines = dummy_defines0}); - - vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - tt::tt_metal::SetRuntimeArgs(program, dummy_kernel0, eth_core_coord, dummy_kernel0_args); - - tt::tt_metal::detail::CompileProgram(device, program); - auto& cq = device->command_queue(); - EnqueueProgram(cq, program, false); - Finish(cq); - - vector dummy_kernel0_args_readback = llrt::read_hex_vec_from_core( - device->id(), - eth_noc_xy, - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - dummy_kernel0_args.size() * sizeof(uint32_t)); - - pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); - - return pass; -} - -bool reader_kernel_no_send( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_reader_core) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; - - auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - 
dram_noc_xy.str(), - dram_byte_address, - eth_reader_core.str(), - eth_l1_byte_address); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - eth_reader_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - eth_reader_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - auto& cq = device->command_queue(); - tt::tt_metal::detail::CompileProgram(device, program); - EnqueueProgram(cq, program, false); - Finish(cq); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; - } - return pass; -} - -bool writer_kernel_no_receive( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_writer_core) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, .size = 
byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; - - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - eth_writer_core.str(), - eth_l1_byte_address, - dram_noc_xy.str(), - dram_byte_address); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - eth_writer_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - eth_writer_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - auto& cq = device->command_queue(); - tt::tt_metal::detail::CompileProgram(device, program); - EnqueueProgram(cq, program, false); - Finish(cq); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; - } - return pass; -} - -bool 
eth_direct_sender_receiver_kernels( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const size_t& byte_size, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - uint32_t num_bytes_per_send = 16) { - bool pass = true; - log_debug( - tt::LogTest, - "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", - byte_size, - sender_device->id(), - eth_sender_core.str(), - src_eth_l1_byte_address, - receiver_device->id(), - eth_receiver_core.str(), - dst_eth_l1_byte_address); - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs, - src_eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)src_eth_l1_byte_address, - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size, - }); - - 
//////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - dst_eth_l1_byte_address, - byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; - std::cout << readback_vec[0] << std::endl; - } - return pass; -} - -bool chip_to_chip_dram_buffer_transfer( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - const size_t& byte_size) { - bool pass = true; - - tt::tt_metal::InterleavedBufferConfig sender_dram_config{ - .device = sender_device, - .size = byte_size, - .page_size = byte_size, - 
.buffer_type = tt::tt_metal::BufferType::DRAM}; - tt::tt_metal::InterleavedBufferConfig receiver_dram_config{ - .device = receiver_device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM}; - - // Create source buffer on sender device - auto input_dram_buffer = CreateBuffer(sender_dram_config); - uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - - // Create dest buffer on receiver device - auto output_dram_buffer = CreateBuffer(receiver_dram_config); - uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - const uint32_t MAX_BUFFER = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - uint32_t num_loops = (uint32_t)(byte_size / MAX_BUFFER); - uint32_t remaining_bytes = (uint32_t)(byte_size % MAX_BUFFER); - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp", - eth_sender_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - 
(uint32_t)remaining_bytes, - (uint32_t)num_loops, - (uint32_t)MAX_BUFFER, - }); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - (uint32_t)remaining_bytes, - (uint32_t)num_loops, - (uint32_t)MAX_BUFFER, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - std::vector dest_dram_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_dram_data); - pass &= (dest_dram_data == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; - std::cout << dest_dram_data[0] << std::endl; - } - return pass; -} - -bool chip_to_chip_interleaved_buffer_transfer( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - const 
CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, - const uint32_t& max_transfer_size) { - bool pass = true; - - const uint32_t input0_cb_index = 0; - const uint32_t output_cb_index = 16; - - TT_FATAL(cfg.num_pages * cfg.page_size_bytes == cfg.size_bytes, "Error"); - constexpr uint32_t num_pages_cb = 1; - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto input_packed = generate_uniform_random_vector(0, 100, cfg.size_bytes / sizeof(uint32_t)); - - tt::tt_metal::InterleavedBufferConfig sender_config{ - .device = sender_device, - .size = cfg.size_bytes, - .page_size = cfg.page_size_bytes, - .buffer_type = cfg.input_buffer_type}; - tt::tt_metal::InterleavedBufferConfig receiver_config{ - .device = receiver_device, - .size = cfg.size_bytes, - .page_size = cfg.page_size_bytes, - .buffer_type = cfg.output_buffer_type}; - auto input_buffer = CreateBuffer(sender_config); - bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; - - tt_metal::detail::WriteToBuffer(input_buffer, input_packed); - - const uint32_t max_buffer = round_down(max_transfer_size, cfg.page_size_bytes); - uint32_t pages_per_loop = max_buffer / cfg.page_size_bytes; - uint32_t num_loops = (uint32_t)(cfg.size_bytes / max_buffer); - uint32_t remaining_bytes = (uint32_t)(cfg.size_bytes % max_buffer); - uint32_t remaining_pages = remaining_bytes / cfg.page_size_bytes; - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_sender.cpp", - eth_sender_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = {(uint32_t)input_is_dram}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)input_buffer->address(), - (uint32_t)cfg.page_size_bytes, 
- (uint32_t)max_buffer, - (uint32_t)num_loops, - (uint32_t)pages_per_loop, - (uint32_t)remaining_bytes, - (uint32_t)remaining_pages}); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto output_buffer = CreateBuffer(receiver_config); - bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; - std::vector all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); - - tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_receiver.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_1, .compile_args = {(uint32_t)output_is_dram}}); - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)output_buffer->address(), - (uint32_t)cfg.page_size_bytes, - (uint32_t)max_buffer, - (uint32_t)num_loops, - (uint32_t)pages_per_loop, - (uint32_t)remaining_bytes, - (uint32_t)remaining_pages, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - std::vector dest_buffer_data; - tt_metal::detail::ReadFromBuffer(output_buffer, dest_buffer_data); - pass &= input_packed == dest_buffer_data; - return pass; -} -} // 
namespace fd_unit_tests::erisc::kernels - -TEST_F(CommandQueueSingleCardFixture, EnqueueDummyProgramOnEthCore) { - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::test_dummy_EnqueueProgram_with_runtime_args(device, eth_core)); - } - } -} - -TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - } -} - -TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectSendAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); - if (receiver_device->id() != device_id) { - continue; - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendDramBufferAllConnectedChips) { - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() != device_id) { - continue; - } - log_info( - tt::LogTest, - "Sending dram buffer 
from device {} to device {}, using eth core {} and {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); - } - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() != device_id) { - continue; - } - - log_info( - tt::LogTest, - "Sending interleaved buffer from device {} to device {}, using eth core {} and {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - BankedConfig test_config = BankedConfig{ - .num_pages = 200, - .size_bytes = 200 * 2 * 32 * 32, - .page_size_bytes = 2 * 32 * 32, - .input_buffer_type = BufferType::L1, - .output_buffer_type = BufferType::DRAM}; - - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, - receiver_device, - sender_eth_core, - receiver_eth_core, - test_config, - 
test_config.page_size_bytes)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); - test_config = BankedConfig{ - .num_pages = 200, - .size_bytes = 200 * 2 * 32 * 32, - .page_size_bytes = 2 * 32 * 32, - .input_buffer_type = BufferType::DRAM, - .output_buffer_type = BufferType::L1}; - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, - receiver_device, - sender_eth_core, - receiver_eth_core, - test_config, - test_config.page_size_bytes)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); - } - } - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp deleted file mode 100644 index 3f0441da418..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp +++ /dev/null @@ -1,495 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -using std::vector; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -constexpr std::int32_t MAX_BUFFER_SIZE = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - -struct BankedConfig { - size_t num_pages = 1; - size_t size_bytes = 1 * 2 * 32 * 32; - size_t page_size_bytes = 2 * 32 * 32; - tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; - tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; - tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; -}; - -std::vector get_hamiltonian_cycle(vector>& adj, int N, int s = 0) { - std::vector> dp(N, std::vector(1 << N, -1)); - - for (int i = 0; i < N; ++i) { - if (adj[s][i]) { - dp[i][(1 << i)] = i; - } - } - - for (int i = 0; i < (1 << N); ++i) { - for (int j = 0; j < N; ++j) { - if (i & (1 << j)) { - for (int k = 0; k < N; ++k) { - if (i & (1 << k) && adj[k][j] && j != k && dp[k][i ^ (1 << j)] != -1) { - dp[j][i] = k; - break; - } - } - } - } - } - - for (int i = 0; i < N; ++i) { - int m = (1 << N) - 1; - - if (dp[i][m] != -1 && i == s) { - std::vector path; - path.reserve(N + 1); - path.push_back(i); - - 
for (int j = 0; j < N - 1; ++j) { - path.push_back(dp[*path.rbegin()][m]); - m ^= 1 << *(path.rbegin() + 1); - } - path.push_back(s); - return path; - } - } - return {}; -} - -std::vector get_device_ring(std::vector devices) { - std::vector> adj(devices.size(), std::vector(devices.size(), 0)); - for (uint32_t i = 0; i < devices.size(); ++i) { - const auto& device = devices[i]; - for (const auto& connected_device_id : device->get_ethernet_connected_device_ids()) { - for (uint32_t j = 0; j < devices.size(); ++j) { - if (devices[j]->id() == connected_device_id) { - adj[i][j] = 1; - } - } - } - } - - const auto& device_ring_idx = get_hamiltonian_cycle(adj, devices.size(), 0); - std::vector device_ring; - device_ring.reserve(device_ring_idx.size()); - for (const auto& device_idx : device_ring_idx) { - device_ring.push_back(devices[device_idx]); - } - return device_ring; -} - -std::vector> get_sender_receiver_cores( - std::vector device_ring) { - std::vector> sender_receivers; - sender_receivers.reserve(device_ring.size() - 1); - - // Special case for 2 devices to ensure core pairs are not the same for send and receive - if (device_ring.size() - 1 == 2) { - const auto& first_device = device_ring[0]; - const auto& second_device = device_ring[1]; - uint32_t i = 0; - for (const auto& first_eth_core : first_device->get_active_ethernet_cores(true)) { - auto [device_id, second_eth_core] = first_device->get_connected_ethernet_core(first_eth_core); - if (second_device->id() == device_id) { - tt_metal::Device *sender_device, *receiver_device; - CoreCoord sender_eth_core, receiver_eth_core; - if (i == 0) { - sender_device = first_device, receiver_device = second_device; - sender_eth_core = first_eth_core, receiver_eth_core = second_eth_core; - } else { - sender_device = second_device, receiver_device = first_device; - sender_eth_core = second_eth_core, receiver_eth_core = first_eth_core; - } - sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, 
receiver_eth_core}); - log_info( - tt::LogTest, - "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - if (i > 0) { - break; - } - i++; - } - } - } else { - for (uint32_t i = 0; i < device_ring.size() - 1; ++i) { - const auto& sender_device = device_ring[i]; - const auto& receiver_device = device_ring[i + 1]; - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() == device_id) { - sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, receiver_eth_core}); - log_info( - tt::LogTest, - "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - break; - } - } - } - } - return sender_receivers; -} -} -} - -namespace fd_unit_tests::erisc::kernels { - -bool eth_direct_ring_gather_sender_receiver_kernels( - std::vector device_ring, - const size_t& byte_size_per_device, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const size_t& sem_l1_byte_address, - uint32_t num_bytes_per_send = 16) { - using namespace CMAKE_UNIQUE_NAMESPACE; - bool pass = true; - const auto& sender_receivers = get_sender_receiver_cores(device_ring); - - // Generate inputs - uint32_t numel = byte_size_per_device / sizeof(uint32_t); - std::vector> inputs; - inputs.reserve(sender_receivers.size()); - std::vector all_zeros(numel * sender_receivers.size(), 0); - std::map programs; - std::vector full_input; - full_input.reserve(numel * sender_receivers.size()); - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - inputs.emplace_back( - generate_uniform_random_vector(0, 100, byte_size_per_device / sizeof(uint32_t), i)); - full_input.insert(full_input.begin() + i * numel, 
inputs[i].begin(), inputs[i].end()); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - const auto& [sender_device, receiver_device, eth_sender_core, eth_receiver_core] = sender_receivers[i]; - auto& sender_program = programs[sender_device->id()]; - auto& receiver_program = programs[receiver_device->id()]; - CoreCoord sender_receiver_core; - for (uint32_t j = 0; j < sender_receivers.size(); ++j) { - if (std::get<1>(sender_receivers[j])->id() == sender_device->id()) { - sender_receiver_core = sender_device->ethernet_core_from_logical_core(std::get<3>(sender_receivers[j])); - } - } - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = { - uint32_t(num_bytes_per_send), - uint32_t(num_bytes_per_send >> 4), - uint32_t(sender_receiver_core.x), - uint32_t(sender_receiver_core.y)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)(src_eth_l1_byte_address + (sender_receivers.size() - 1) * byte_size_per_device), - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size_per_device, - (uint32_t)sender_receivers.size() - 1, - (uint32_t)(src_eth_l1_byte_address + i * byte_size_per_device), - (uint32_t)i, - (uint32_t)sem_l1_byte_address}); - - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs[i], - src_eth_l1_byte_address + i * byte_size_per_device); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - std::vector{INVALID}, - sem_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Receiver 
Device - //////////////////////////////////////////////////////////////////////////// - // Clear expected value at ethernet L1 address - CoreCoord receiver_sender_core; - for (uint32_t j = 0; j < sender_receivers.size(); ++j) { - if (std::get<0>(sender_receivers[j])->id() == receiver_device->id()) { - receiver_sender_core = - receiver_device->ethernet_core_from_logical_core(std::get<2>(sender_receivers[j])); - } - } - - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - std::vector{INVALID}, - sem_l1_byte_address); - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_1, - .compile_args = { - uint32_t(receiver_sender_core.x), - uint32_t(receiver_sender_core.y)}}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - {(uint32_t)byte_size_per_device, (uint32_t)sender_receivers.size() - 1, (uint32_t)sem_l1_byte_address}); - } - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - std::vector> cqs; - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); - auto& cq = device->command_queue(); - - EnqueueProgram(cq, programs.at(device->id()), false); - cqs.emplace_back(cq); - } - for (auto& cq : cqs) { - Finish(cq); - } - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - 
const auto& device = std::get<0>(sender_receivers[i]); - const auto& core = std::get<2>(sender_receivers[i]); - auto readback_vec = llrt::read_hex_vec_from_core( - device->id(), - device->ethernet_core_from_logical_core(core), - src_eth_l1_byte_address, - byte_size_per_device * sender_receivers.size()); - auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); - bool p = (a.first == full_input.end()); - pass &= p; - if (not p) { - log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); - log_error( - tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); - } - } - - return pass; -} - -bool eth_interleaved_ring_gather_sender_receiver_kernels( - std::vector device_ring, - const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const size_t& sem_l1_byte_address, - uint32_t num_bytes_per_send = 16) { - using namespace CMAKE_UNIQUE_NAMESPACE; - bool pass = true; - const auto& sender_receivers = get_sender_receiver_cores(device_ring); - - // Generate inputs - uint32_t numel = cfg.size_bytes / sizeof(uint32_t); - std::vector> inputs; - inputs.reserve(sender_receivers.size()); - std::vector all_zeros(numel * sender_receivers.size(), 0); - std::map programs; - std::vector full_input; - full_input.reserve(numel * sender_receivers.size()); - - std::vector> output_buffers; - output_buffers.reserve(sender_receivers.size()); - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); - full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); - - const auto& device = std::get<0>(sender_receivers[i]); - const auto& eth_sender_core = std::get<2>(sender_receivers[i]); - CoreCoord eth_receiver_core; - for (uint32_t j = 0; j < 
sender_receivers.size(); ++j) { - if (std::get<1>(sender_receivers[j])->id() == device->id()) { - eth_receiver_core = std::get<3>(sender_receivers[j]); - break; - } - } - - auto& program = programs[device->id()]; - - auto input_buffer = - CreateBuffer(InterleavedBufferConfig{device, cfg.size_bytes, cfg.page_size_bytes, cfg.input_buffer_type}); - bool input_is_dram = cfg.input_buffer_type == tt_metal::BufferType::DRAM; - tt_metal::detail::WriteToBuffer(input_buffer, inputs[i]); - output_buffers.emplace_back(CreateBuffer(InterleavedBufferConfig{ - device, cfg.size_bytes * sender_receivers.size(), cfg.page_size_bytes, cfg.output_buffer_type})); - tt_metal::detail::WriteToBuffer(output_buffers[i], all_zeros); - - auto eth_sender_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = { - uint32_t(num_bytes_per_send), - uint32_t(num_bytes_per_send >> 4), - uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).x), - uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).y), - uint32_t(input_buffer->buffer_type() == tt_metal::BufferType::DRAM), - uint32_t(output_buffers[i]->buffer_type() == tt_metal::BufferType::DRAM)}}); - - tt_metal::SetRuntimeArgs( - program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)(src_eth_l1_byte_address), - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)cfg.size_bytes + 32, // + 32 for idx - (uint32_t)sender_receivers.size() - 1, - (uint32_t)(i * cfg.num_pages), - (uint32_t)input_buffer->address(), - (uint32_t)output_buffers[i]->address(), - (uint32_t)cfg.num_pages, - (uint32_t)cfg.page_size_bytes, - (uint32_t)sem_l1_byte_address}); - llrt::write_hex_vec_to_core( - device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address); - - llrt::write_hex_vec_to_core( - 
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_1, - .compile_args = { - uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).x), - uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).y), - uint32_t( - output_buffers[i]->buffer_type() == tt_metal::BufferType::DRAM)}}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - program, - eth_receiver_kernel, - eth_receiver_core, - {(uint32_t)dst_eth_l1_byte_address, - (uint32_t)cfg.size_bytes + 32, // + 32 for idx - (uint32_t)sender_receivers.size() - 1, - (uint32_t)output_buffers[i]->address(), - (uint32_t)cfg.num_pages, - (uint32_t)cfg.page_size_bytes, - (uint32_t)sem_l1_byte_address}); - } - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - std::vector> cqs; - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); - auto& cq = device->command_queue(); - - EnqueueProgram(cq, programs.at(device->id()), false); - cqs.emplace_back(cq); - } - for (auto& cq : cqs) { - Finish(cq); - } - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - const auto& core = std::get<2>(sender_receivers[i]); - std::vector readback_vec; - tt_metal::detail::ReadFromBuffer(output_buffers[i], readback_vec); - auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); - bool p = (a.first == full_input.end()); - pass &= p; - if 
(not p) { - log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); - log_error( - tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); - } - } - - return pass; -} -} // namespace fd_unit_tests::erisc::kernels - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - if (num_devices_ < 4) { - GTEST_SKIP(); - } - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const auto& device_ring = get_device_ring(devices_); - if (device_ring.empty()) { - GTEST_SKIP(); - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_ring_gather_sender_receiver_kernels( - device_ring, WORD_SIZE, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsInterleavedRingGatherAllChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - if (num_devices_ < 4) { - GTEST_SKIP(); - } - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - BankedConfig test_config = - BankedConfig{.num_pages = 10, .size_bytes = 10 * 2 * 32 * 32, .page_size_bytes = 2 * 32 * 32}; - const auto& device_ring = get_device_ring(devices_); - if (device_ring.empty()) { - GTEST_SKIP(); - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_interleaved_ring_gather_sender_receiver_kernels( - device_ring, test_config, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); -} diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp deleted file mode 100644 index 83ef6759c4d..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/common/core_coord.hpp" -#include "tt_metal/impl/buffers/global_semaphore.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/event/event.hpp" -#include "tt_metal/impl/sub_device/sub_device.hpp" -#include "tests/tt_metal/test_utils/stimulus.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp" - -using namespace tt::tt_metal; - -namespace basic_tests { - -std::tuple> create_single_sync_program(Device *device, SubDevice sub_device) { - auto syncer_coord = sub_device.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto global_sem = CreateGlobalSemaphore(device, sub_device.cores(HalProgrammableCoreType::TENSIX), INVALID); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - return {std::move(syncer_program), std::move(syncer_coord), std::move(global_sem)}; -} - -std::tuple> create_basic_sync_program(Device *device, const SubDevice& sub_device_1, const SubDevice& 
sub_device_2) { - auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); - auto waiter_core_physical = device->worker_core_from_logical_core(waiter_coord); - auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); - auto syncer_coord = incrementer_cores.ranges().back().end_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); - auto all_cores = waiter_core.merge(incrementer_cores).merge(syncer_core); - auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); - - Program waiter_program = CreateProgram(); - auto waiter_kernel = CreateKernel( - waiter_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp", - waiter_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array waiter_rt_args = {global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y}; - SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - - Program incrementer_program = CreateProgram(); - auto incrementer_kernel = CreateKernel( - incrementer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp", - incrementer_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = 
NOC::RISCV_1_default}); - std::array incrementer_rt_args = {global_sem->address(), waiter_core_physical.x, waiter_core_physical.y}; - SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); - return {std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; -} - -std::tuple> create_basic_eth_sync_program(Device *device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { - auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::ACTIVE_ETH).ranges().at(0).start_coord; - auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); - auto waiter_core_physical = device->ethernet_core_from_logical_core(waiter_coord); - auto tensix_waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto tensix_waiter_core = CoreRangeSet(CoreRange(tensix_waiter_coord, tensix_waiter_coord)); - auto tensix_waiter_core_physical = device->worker_core_from_logical_core(tensix_waiter_coord); - auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); - auto syncer_coord = incrementer_cores.ranges().back().end_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); - auto all_cores = tensix_waiter_core.merge(incrementer_cores).merge(syncer_core); - auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); - - Program waiter_program = CreateProgram(); - auto waiter_kernel = CreateKernel( - waiter_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp", - waiter_core, - EthernetConfig{ - .noc = NOC::RISCV_0_default, - .processor = DataMovementProcessor::RISCV_0}); - std::array waiter_rt_args = {global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y, tensix_waiter_core_physical.x, 
tensix_waiter_core_physical.y, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE}; - SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - - Program incrementer_program = CreateProgram(); - auto incrementer_kernel = CreateKernel( - incrementer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp", - incrementer_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default}); - std::array incrementer_rt_args = {global_sem->address(), tensix_waiter_core_physical.x, tensix_waiter_core_physical.y}; - SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); - return {std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceAllocations) { - uint32_t local_l1_size = 3200; - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); - CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4}); - - auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); - auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true); - - ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, 
{sharded_cores_1.num_cores(), 1}); - uint32_t page_size_1 = 32; - ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; - auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); - - ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1}); - uint32_t page_size_2 = 64; - ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2}; - auto input_2 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_2.size / sizeof(uint32_t)); - - uint32_t page_size_3 = 1024; - InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED}; - auto input_3 = tt::test_utils::generate_uniform_random_vector(0, 100, interleaved_config.size / sizeof(uint32_t)); - - for (Device *device : devices_) { - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); - DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); - DeviceAddr max_addr = l1_unreserved_base + local_l1_size; - - shard_config_1.device = device; - shard_config_2.device = device; - interleaved_config.device = device; - - std::vector physical_cores_1; - physical_cores_1.reserve(sharded_cores_1_vec.size()); - for (const auto& core : sharded_cores_1_vec) { - physical_cores_1.push_back(device->worker_core_from_logical_core(core)); - } - - std::vector physical_cores_2; - physical_cores_2.reserve(sharded_cores_2_vec.size()); - for (const auto& core : sharded_cores_2_vec) { - 
physical_cores_2.push_back(device->worker_core_from_logical_core(core)); - } - - device->load_sub_device_manager(sub_device_manager_1); - - auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size()); - EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false); - std::vector output_1; - EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true); - EXPECT_EQ(input_1, output_1); - auto input_1_it = input_1.begin(); - for (const auto& physical_core : physical_cores_1) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_1->address(), page_size_1); - EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); - input_1_it += page_size_1 / sizeof(uint32_t); - } - - auto buffer_2 = CreateBuffer(interleaved_config); - EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception); - EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception); - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); - DeallocateBuffer(*buffer_1); - device->clear_loaded_sub_device_manager(); - device->load_sub_device_manager(sub_device_manager_2); - - auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1}); - EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size()); - EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false); - std::vector output_2; - EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true); - EXPECT_EQ(input_2, output_2); - auto input_2_it = input_2.begin(); - for (const auto& physical_core : physical_cores_2) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_3->address(), page_size_2); - EXPECT_TRUE(std::equal(input_2_it, input_2_it + page_size_2 / sizeof(uint32_t), readback.begin())); - input_2_it += page_size_2 / sizeof(uint32_t); - } - - auto buffer_4 = 
CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size()); - EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceSynchronization) { - uint32_t local_l1_size = 3200; - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); - - auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); - - ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); - uint32_t page_size_1 = 32; - ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; - auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); - - std::array sub_device_ids_to_block = {SubDeviceId{0}}; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); - - shard_config_1.device = device; - - std::vector physical_cores_1; - physical_cores_1.reserve(sharded_cores_1_vec.size()); - for (const auto& core : sharded_cores_1_vec) { - physical_cores_1.push_back(device->worker_core_from_logical_core(core)); - } - - device->load_sub_device_manager(sub_device_manager); - - auto [program, syncer_core, global_semaphore] = create_single_sync_program(device, sub_device_2); - EnqueueProgram(device->command_queue(), program, false); - - auto buffer_1 = CreateBuffer(shard_config_1, sub_device_ids_to_block[0]); - - // Test blocking synchronize doesn't stall - Synchronize(device, 0, sub_device_ids_to_block); - 
- // Test blocking write buffer doesn't stall - EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, true, sub_device_ids_to_block); - - // Test record event won't cause a stall - auto event = std::make_shared(); - EnqueueRecordEvent(device->command_queue(), event, sub_device_ids_to_block); - Synchronize(device, 0, sub_device_ids_to_block); - - // Test blocking read buffer doesn't stall - std::vector output_1; - EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true, sub_device_ids_to_block); - EXPECT_EQ(input_1, output_1); - auto input_1_it = input_1.begin(); - for (const auto& physical_core : physical_cores_1) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_1->address(), page_size_1); - EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); - input_1_it += page_size_1 / sizeof(uint32_t); - } - auto sem_addr = global_semaphore->address(); - auto physical_syncer_core = device->worker_core_from_logical_core(syncer_core); - tt::llrt::write_hex_vec_to_core(device->id(), physical_syncer_core, std::vector{1}, sem_addr); - - // Full synchronization - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceBasicPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - for (uint32_t i = 0; i < num_iters; i++) { - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - 
EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceBasicEthPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); - - for (uint32_t i = 0; i < num_iters; i++) { - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceBasicPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = 
create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program, false); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - for (uint32_t i = 0; i < num_iters; i++) { - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program, false); - ReplayTrace(device, device->command_queue().id(), tid_2, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceBasicEthPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - 
auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program, false); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - for (uint32_t i = 0; i < num_iters; i++) { - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program, false); - ReplayTrace(device, 
device->command_queue().id(), tid_2, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceProgramsReconfigureSubDevices) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::array{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - SubDevice sub_device_3(std::array{CoreRangeSet(std::array{CoreRange({2, 4}, {3, 4}), CoreRange({5, 1}, {6, 3})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_4(std::array{CoreRangeSet(std::array{CoreRange({2, 1}, {2, 2}), CoreRange({1, 5}, {5, 5})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_3, sub_device_4}, 3200); - - device->load_sub_device_manager(sub_device_manager_1); - - auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - 
auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - device->load_sub_device_manager(sub_device_manager_2); - - auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_eth_sync_program(device, sub_device_3, sub_device_4); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - Synchronize(device); - - // Capture the trace - auto tid_3 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_3); - - auto tid_4 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_4); - - for (uint32_t i = 0; i < num_iters; i++) { - device->load_sub_device_manager(sub_device_manager_1); - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program_1, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program_1, false); - ReplayTrace(device, 
device->command_queue().id(), tid_2, false); - - device->load_sub_device_manager(sub_device_manager_2); - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program_2, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_3, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program_2, false); - ReplayTrace(device, device->command_queue().id(), tid_4, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceIllegalOperations) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - - // Assert no idle eth cores specified - EXPECT_THROW(SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3})), CoreRangeSet(CoreRange({4, 4}, {4, 4})), CoreRangeSet(CoreRange({5, 5}, {5, 5}))}), std::exception); - for (Device *device : devices_) { - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_2, sub_device_1}, 3200); - device->load_sub_device_manager(sub_device_manager_1); - - auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_1, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, 
device->command_queue().id()); - // Can not load a sub-device manager while tracing - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - device->load_sub_device_manager(sub_device_manager_2); - auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_sync_program(device, sub_device_2, sub_device_1); - - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - Synchronize(device); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - // Regular program execution - // Can not run a program on a different sub-device manager - EXPECT_THROW(EnqueueProgram(device->command_queue(), waiter_program_1, false), std::exception); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_2, false); - - // Can not replay a trace on a different sub-device manager - EXPECT_THROW(ReplayTrace(device, device->command_queue().id(), tid_1, false), std::exception); - - Synchronize(device); - - device->remove_sub_device_manager(sub_device_manager_1); - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception); - } -} - -} // namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt deleted file mode 100644 index 00e5b547319..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWaitForEvent.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp -) - -add_executable(unit_tests_fast_dispatch_single_chip_multi_queue ${UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS}) -TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch_single_chip_multi_queue) - -target_link_libraries(unit_tests_fast_dispatch_single_chip_multi_queue PUBLIC test_metal_common_libs) -target_include_directories( - unit_tests_fast_dispatch_single_chip_multi_queue - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_fast_dispatch_single_chip_multi_queue - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp deleted file 
mode 100644 index 4e407df6d4e..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "impl/buffers/buffer.hpp" -#include "impl/device/device.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" - -using std::vector; -using namespace tt::tt_metal; - -struct CBConfig { - uint32_t cb_id; - uint32_t num_pages; - uint32_t page_size; - tt::DataFormat data_format; -}; - -struct DummyProgramConfig { - CoreRangeSet cr_set; - CBConfig cb_config; - uint32_t num_cbs; - uint32_t num_sems; -}; - -struct DummyProgramMultiCBConfig { - CoreRangeSet cr_set; - std::vector cb_config_vector; - uint32_t num_sems; -}; - - -namespace local_test_functions { - -// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. -// Optionally force the max size for one of the vectors. -std::pair, std::vector> create_runtime_args(bool force_max_size = false, uint32_t unique_base = 0, uint32_t common_base = 100){ - - constexpr uint32_t MAX_RUNTIME_ARGS = 255; - - // Generate Unique Runtime Args. Common RT args starting address must be L1 Aligned, so account for that here via padding - uint32_t num_rt_args_unique = num_rt_args_unique = rand() % (MAX_RUNTIME_ARGS + 1); - uint32_t num_rt_args_unique_padded = align(num_rt_args_unique, hal.get_alignment(HalMemType::L1) / sizeof(uint32_t)); - uint32_t num_rt_args_common = num_rt_args_unique_padded < MAX_RUNTIME_ARGS ? 
rand() % (MAX_RUNTIME_ARGS - num_rt_args_unique_padded + 1) : 0; - - if (force_max_size) { - if (rand() % 2) { - num_rt_args_unique = MAX_RUNTIME_ARGS; - num_rt_args_common = 0; - } else { - num_rt_args_common = MAX_RUNTIME_ARGS; - num_rt_args_unique = 0; - } - } - - vector rt_args_common; - for (uint32_t i = 0; i < num_rt_args_common; i++) { - rt_args_common.push_back(common_base + i); - } - - vector rt_args_unique; - for (uint32_t i = 0; i < num_rt_args_unique; i++) { - rt_args_unique.push_back(unique_base + i); - } - - log_trace(tt::LogTest, "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", __FUNCTION__, num_rt_args_unique, num_rt_args_common, force_max_size); - return std::make_pair(rt_args_unique, rt_args_common); -} - - -} // namespace local_test_functions - -namespace stress_tests { - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestRandomizedProgram) { - uint32_t NUM_PROGRAMS = 100; - uint32_t MAX_LOOP = 100; - uint32_t page_size = 1024; - - if (this->arch_ == tt::ARCH::BLACKHOLE) { - GTEST_SKIP(); // Running on second CQ is hanging on CI - } - - // Make random - auto random_seed = 0; // (unsigned int)time(NULL); - uint32_t seed = tt::parse_env("SEED", random_seed); - log_info(tt::LogTest, "Using Test Seed: {}", seed); - srand(seed); - - CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); - CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); - - log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); - - vector programs; - for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { - programs.push_back(Program()); - Program& program = programs.back(); - - std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; - std::map compute_defines = {{"COMPUTE", "1"}}; - - // brisc - uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; - bool USE_MAX_RT_ARGS; - - if (i == 0) { - // Ensures that we get at least one compilation with 
the max amount to - // ensure it compiles and runs - BRISC_OUTER_LOOP = MAX_LOOP; - BRISC_MIDDLE_LOOP = MAX_LOOP; - BRISC_INNER_LOOP = MAX_LOOP; - NUM_CBS = NUM_CIRCULAR_BUFFERS; - NUM_SEMS = NUM_SEMAPHORES; - USE_MAX_RT_ARGS = true; - } else { - BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; - NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; - USE_MAX_RT_ARGS = false; - } - - log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}", - i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS); - - for (uint32_t j = 0; j < NUM_CBS; j++) { - CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1)); - auto cb = CreateCircularBuffer(program, cr_set, cb_config); - } - - for (uint32_t j = 0; j < NUM_SEMS; j++) { - CreateSemaphore(program, cr_set, j + 1); - } - - auto [brisc_unique_rtargs, brisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); - uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); - vector brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size}; - - // ncrisc - uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; - if (i == 0) { - NCRISC_OUTER_LOOP = MAX_LOOP; - NCRISC_MIDDLE_LOOP = MAX_LOOP; - NCRISC_INNER_LOOP = MAX_LOOP; - } else { - NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = 
local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); - uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); - vector ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size}; - - // trisc - uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP; - if (i == 0) { - TRISC_OUTER_LOOP = MAX_LOOP; - TRISC_MIDDLE_LOOP = MAX_LOOP; - TRISC_INNER_LOOP = MAX_LOOP; - } else { - TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [trisc_unique_rtargs, trisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); - uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); - vector trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size}; - - bool at_least_one_kernel = false; - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_brisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_ncrisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, 
.compile_args = ncrisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_trisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ - .math_approx_mode = false, - .compile_args = trisc_compile_args, - .defines = compute_defines - }); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - at_least_one_kernel = true; - } - - if (not at_least_one_kernel) { - uint32_t random_risc = rand() % 3 + 1; - if (random_risc == 1) { - auto dummy_brisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - } else if (random_risc == 2) { - auto dummy_ncrisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - } else if (random_risc == 3) { - auto dummy_trisc_kernel = CreateKernel( - program, 
"tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ - .math_approx_mode = false, - .compile_args = trisc_compile_args, - .defines = compute_defines - }); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - } else { - TT_THROW("Invalid"); - } - } - - tt::tt_metal::detail::CompileProgram(this->device_, program); - } - - for (uint8_t cq_id = 0; cq_id < this->device_->num_hw_cqs(); ++cq_id) { - log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id); - // This loop caches program and runs - for (Program& program: programs) { - EnqueueProgram(this->device_->command_queue(cq_id), program, false); - } - - // This loops assumes already cached - uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that - - log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS); - for (uint32_t i = 0; i < NUM_ITERATIONS; i++) { - auto rng = std::default_random_engine {}; - std::shuffle(std::begin(programs), std::end(programs), rng); - if (i % 10 == 0) { - log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS); - } - for (Program& program: programs) { - EnqueueProgram(this->device_->command_queue(cq_id), program, false); - } - } - - log_info(tt::LogTest, "Calling Finish."); - Finish(this->device_->command_queue(cq_id)); - } -} - -} // namespace stress_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp deleted file mode 100644 index 0f9c35adb96..00000000000 --- 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "command_queue_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" - -using std::vector; -using namespace tt::tt_metal; - -Program create_simple_unary_program(const Buffer& input, const Buffer& output) { - Program program = CreateProgram(); - - CoreCoord worker = {0, 0}; - auto reader_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/reader_unary.cpp", - worker, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - - auto writer_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/writer_unary.cpp", - worker, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto sfpu_kernel = CreateKernel( - program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", - worker, - ComputeConfig{ - .math_approx_mode = true, - .compile_args = {1, 1}, - .defines = {{"SFPU_OP_EXP_INCLUDE", "1"}, {"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}}); - - CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, 2048); - - CoreRange core_range({0, 0}); - CreateCircularBuffer(program, core_range, input_cb_config); - vector writer_rt_args = { - output.address(), - (uint32_t)output.noc_coordinates().x, - (uint32_t)output.noc_coordinates().y, - output.num_pages() - }; - SetRuntimeArgs(program, writer_kernel, worker, writer_rt_args); - - CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, 2048); - - CreateCircularBuffer(program, core_range, output_cb_config); - vector reader_rt_args = { - 
input.address(), - (uint32_t)input.noc_coordinates().x, - (uint32_t)input.noc_coordinates().y, - input.num_pages() - }; - SetRuntimeArgs(program, reader_kernel, worker, reader_rt_args); - - return program; -} - -// All basic trace tests just assert that the replayed result exactly matches -// the eager mode results -namespace basic_tests { - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { - Setup(2048, 2); - auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - CommandQueue& command_queue = this->device_->command_queue(0); - CommandQueue& data_movement_queue = this->device_->command_queue(1); - - Program simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Eager mode - vector eager_output_data; - eager_output_data.resize(input_data.size()); - - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - EnqueueProgram(command_queue, simple_program, true); - EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); - - // Trace mode - vector trace_output_data; - trace_output_data.resize(input_data.size()); - - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - - uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), tid); - - EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); - EXPECT_TRUE(eager_output_data == trace_output_data); - - // Done - Finish(command_queue); - ReleaseTrace(this->device_, tid); -} - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { - Setup(4096, 2); - auto input = Buffer::create(this->device_, 2048, 2048, 
BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - CommandQueue& command_queue = this->device_->command_queue(0); - CommandQueue& data_movement_queue = this->device_->command_queue(1); - - Program simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Trace mode output - uint32_t num_loops = 10; - vector> trace_outputs; - - for (auto i = 0; i < num_loops; i++) { - trace_outputs.push_back({}); - trace_outputs[i].resize(input_data.size()); - } - - // Compile - EnqueueProgram(command_queue, simple_program, true); - - // Trace mode execution - uint32_t trace_id = 0; - bool trace_captured = false; - for (auto i = 0; i < num_loops; i++) { - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - - if (not trace_captured) { - trace_id = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), trace_id); - trace_captured = true; - } - - EnqueueTrace(command_queue, trace_id, false); - EnqueueReadBuffer(data_movement_queue, *output, trace_outputs[i].data(), true); - - // Expect same output across all loops - EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); - } - - // Done - Finish(command_queue); - ReleaseTrace(this->device_, trace_id); -} - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { - Setup(6144, 2); - auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - constexpr bool kBlocking = true; - constexpr bool kNonBlocking = false; - vector blocking_flags = {kBlocking, kNonBlocking}; - - // Single Q for data and commands - // Keep this queue in passthrough mode for now - CommandQueue& command_queue = this->device_->command_queue(0); 
- - auto simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Trace mode output - uint32_t num_loops = 10; - vector> trace_outputs; - - for (auto i = 0; i < num_loops; i++) { - trace_outputs.push_back({}); - trace_outputs[i].resize(input_data.size()); - } - - // Eager mode - vector expected_output_data; - vector eager_output_data; - expected_output_data.resize(input_data.size()); - eager_output_data.resize(input_data.size()); - - // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); - EnqueueProgram(command_queue, simple_program, kBlocking); - EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); - Finish(command_queue); - - for (bool blocking : blocking_flags) { - std::string mode = blocking ? "Eager-B" : "Eager-NB"; - for (auto i = 0; i < num_loops; i++) { - tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); - EnqueueProgram(command_queue, simple_program, blocking); - EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); - } - if (not blocking) { - // (Optional) wait for the last non-blocking command to finish - Finish(command_queue); - } - EXPECT_TRUE(eager_output_data == expected_output_data); - } - - // Capture trace on a trace queue - uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), tid); - - // Trace mode execution - for (auto i = 0; i < num_loops; i++) { - tt::ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); - EnqueueTrace(command_queue, tid, kNonBlocking); - 
EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); - } - Finish(command_queue); - - // Expect same output across all loops - for (auto i = 0; i < num_loops; i++) { - EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); - } - ReleaseTrace(this->device_, tid); -} - -} // end namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp deleted file mode 100644 index 6932ab11955..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device.hpp" - -using std::vector; -using namespace tt::tt_metal; - - -namespace local_test_functions { - -bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector>& cqs, const TestBufferConfig& config) { - bool pass = true; - for (const bool use_void_star_api: {true, false}) { - - size_t buf_size = config.num_pages * config.page_size; - std::vector> buffers; - std::vector> srcs; - for (uint i = 0; i < cqs.size(); i++) { - buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); - srcs.push_back(generate_arange_vector(buffers[i]->size())); - if (use_void_star_api) { - EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); - } else { - EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); - } - } - - for (uint i = 0; i < cqs.size(); i++) { - 
std::vector result; - if (use_void_star_api) { - result.resize(buf_size / sizeof(uint32_t)); - EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true); - } else { - EnqueueReadBuffer(cqs[i], *buffers[i], result, true); - } - bool local_pass = (srcs[i] == result); - pass &= local_pass; - } - } - - return pass; -} -} - - -namespace basic_tests { -namespace dram_tests { - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToDramBank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } - -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllDramBanks) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = { - .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { - constexpr uint32_t num_round_robins = 2; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = { - .num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, Sending131072Pages) { - // Was a failing case where we used to accidentally program cb num pages to be total - // pages instead of cb num pages. - TestBufferConfig config = { - .num_pages = 131072, - .page_size = 128, - .buftype = BufferType::DRAM}; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram2) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - // From stable diffusion read buffer - TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", 
device->id()); - uint32_t page_size = 2048; - uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); - uint32_t num_pages = command_queue_size / page_size; - - TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllDramBanks) { - TestBufferConfig config = { - .num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { - constexpr uint32_t num_round_robins = 2; - TestBufferConfig config = { - .num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} 
- -TEST_F(MultiCommandQueueSingleDeviceFixture, Sending131072Pages) { - // Was a failing case where we used to accidentally program cb num pages to be total - // pages instead of cb num pages. - TestBufferConfig config = { - .num_pages = 131072, - .page_size = 128, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram) { - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram2) { - // From stable diffusion read buffer - TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestPageSizeTooLarge) { - if (this->arch_ == tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP(); // This test hanging on wormhole b0 - } - // Should throw a host error due to the page size not fitting in the consumer CB - TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - 
EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { - uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel); - uint32_t num_pages = command_queue_size / page_size; - - TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - - -} // end namespace dram_tests - -namespace l1_tests { - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToL1Bank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1Banks) { - auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - 
-TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { - auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForL1) { - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToL1Bank0) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllL1Banks) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - auto compute_with_storage_grid = device->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - 
CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - auto compute_with_storage_grid = device->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForL1) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -} // end namespace l1_tests -} // end namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp deleted file mode 100644 index b3efb0e4f16..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/llrt/rtoptions.hpp" - -using namespace tt::tt_metal; - -class MultiCommandQueueSingleDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); - if (num_cqs != 2) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { - if (!tt::tt_metal::IsGalaxyCluster()) { - tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in Setup()"); - dispatch_core_type = DispatchCoreType::ETH; - } - } - device_ = tt::tt_metal::CreateDevice(0, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - } - - void TearDown() override { - tt::tt_metal::CloseDevice(device_); - } - - tt::tt_metal::Device* device_; - tt::ARCH arch_; -}; - -class MultiCommandQueueMultiDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); - if (num_cqs != 2) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - - DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { - if (!tt::tt_metal::IsGalaxyCluster()) { - tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in Setup()"); - dispatch_core_type = DispatchCoreType::ETH; - } - } - - const chip_id_t mmio_device_id = 0; - reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - for (const auto &[id, device] : reserved_devices_) { - devices_.push_back(device); - } - - num_devices_ = reserved_devices_.size(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - std::vector devices_; - std::map reserved_devices_; - size_t num_devices_; - tt::ARCH arch_; -}; - - -class SingleDeviceTraceFixture: public ::testing::Test { -protected: - Device* device_; - tt::ARCH arch_; - - void Setup(const size_t buffer_size, const uint8_t num_hw_cqs = 1) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - if (num_hw_cqs > 1) { - // Running multi-CQ test. User must set this explicitly. 
- auto num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); - if (num_cqs == nullptr or strcmp(num_cqs, "2")) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const int device_id = 0; - this->device_ = tt::tt_metal::CreateDevice(device_id, num_hw_cqs, 0, buffer_size);; - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - tt::tt_metal::CloseDevice(this->device_); - } - } - -}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp deleted file mode 100644 index e1e02ae6e16..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" -#include "tt_metal/common/bfloat16.hpp" - -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - -struct BufferStressTestConfig { - // Used for normal write/read tests - uint32_t seed; - uint32_t num_pages_total; - - uint32_t page_size; - uint32_t max_num_pages_per_buffer; - - // Used for wrap test - uint32_t num_iterations; - uint32_t num_unique_vectors; -}; - - -inline std::vector generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) { - TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error"); - std::vector src(size_bytes / sizeof(uint32_t), 0); - - for (uint32_t i = 0; i < src.size(); i++) { - src.at(i) = start + i; - } - return src; -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt deleted file mode 100644 index 8cdca979930..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -set(UNIT_TESTS_FREQUENT_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tests/run_many_times.cpp) - -add_executable(unit_tests_frequent ${UNIT_TESTS_FREQUENT_SRCS}) - -target_link_libraries( - unit_tests_frequent - PUBLIC - test_metal_common_libs - gtest - gtest_main -) -target_include_directories( - unit_tests_frequent - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_frequent - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/ttnn/unit_tests/gtests/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp index c1be54118a6..7b0be8728ac 100644 --- a/tests/ttnn/unit_tests/gtests/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "ttnn/device.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/core/core.hpp" diff --git a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp index 311639585bc..a03ed11549e 100644 --- a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" #include "tt_metal/common/logger.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "ttnn/device.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/core/core.hpp"