From 1a3c6f023e01c0b7b7495b23f9981a367ecec876 Mon Sep 17 00:00:00 2001 From: Michael Halkenhaeuser Date: Mon, 11 Nov 2024 10:59:40 -0600 Subject: [PATCH] [layer_norm][AMDGPU] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x Replace the (more) exact calculation with a hardware approximation. Enable at build time via the environment variable `PYTORCH_LAYERNORM_FAST_RECIPROCAL=1` (Default: disabled) Benefits: Reduced code size. Improved performance for certain scenarios. Experiments show only a small reduction in precision. Experiments show no significant performance regressions. bfloat16 as well as float16 related calculations may benefit substantially from this change. vectorized_layer_norm_kernel: Gains performance especially for the following tensor shapes. Lower values for dim1 do not change performance significantly. dim1 = 8k-65k may gain considerable performance, but the gain declines gradually with size. dim0 dim1 ---- ---- 1024 8192 1024 16384 1024 32768 1024 65536 1024 131072 1024 262144 1024 524288 --------- Co-authored-by: Hashem Hashemi --- aten/src/ATen/native/cuda/layer_norm_kernel.cu | 8 ++++++++ cmake/Dependencies.cmake | 9 +++++++++ setup.py | 4 ++++ 3 files changed, 21 insertions(+) diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index f06b247ef32be..468fab5dec91d 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -126,7 +126,11 @@ WelfordDataLN cuWelfordOnlineSum( { U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } @@ -140,7 +144,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if
(count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 504d6ee2243db..34226bc0074bb 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1066,6 +1066,15 @@ if(USE_ROCM) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) endif(CMAKE_BUILD_TYPE MATCHES Debug) + set(PYTORCH_LAYERNORM_FAST_RECIPROCAL + $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL} + CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE + ) + + if(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL) + endif() + # needed for compat with newer versions of hip-clang that introduced C++20 mangling rules list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17) diff --git a/setup.py b/setup.py index ea087c356c152..624964b99b3b4 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,10 @@ # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # +# PYTORCH_LAYERNORM_FAST_RECIPROCAL +# Enable (=1) the use of builtin functions for fast reciprocals (1/x) w.r.t. +# layer normalization. Default: disabled. +# # Environment variables we respect (these environment variables are # conventional and are often understood/set by other software.) #