Skip to content

Commit

Permalink
Cherry-pick for IFC mix build and bug-fix (#998 and #1002) (#1000)
Browse files Browse the repository at this point in the history
* IFC mix build (#998)
(cherry picked from commit 7c0f49a)

* Fix gcnArch bug in IFC mix build (#998) (#1002)
(cherry picked from commit bc44e3f)

---------
Signed-off-by: nileshnegi <Nilesh.Negi@amd.com>
Co-authored-by: Bertan Dogancay <Bertan.Dogancay@amd.com>
  • Loading branch information
nileshnegi authored Dec 7, 2023
1 parent 441a7f3 commit 2f6d59e
Show file tree
Hide file tree
Showing 13 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ option(BUILD_SHARED_LIBS "Build as shared library"
option(BUILD_TESTS "Build unit test programs" OFF)
option(COLLTRACE "Collective Trace Option" ON)
option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON)
option(ENABLE_IFC "Enable indirect function call" OFF)
option(ENABLE_IFC "Enable indirect function call" ON)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(PROFILE "Enable profiling" OFF)
option(TIMETRACE "Enable time-trace during compilation" OFF)
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/all_gather.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
6 changes: 3 additions & 3 deletions src/collectives/device/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down Expand Up @@ -223,7 +223,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeUpDown(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
Expand Down Expand Up @@ -379,7 +379,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeSplit(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/alltoall_pivot.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
8 changes: 4 additions & 4 deletions src/collectives/device/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");

#ifndef USE_INDIRECT_FUNCTION_CALL
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
template<unsigned short f, unsigned short l, bool u>
struct Caller {
static __forceinline__ __device__ __host__
Expand Down Expand Up @@ -274,7 +274,7 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction {
public:
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
#else
__device__ void run(struct ncclWorkElem* args) {}
Expand Down Expand Up @@ -568,7 +568,7 @@ __forceinline__ __device__ void ncclKernel(
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
} else {
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
ncclFuncs[ncclShmem.work.header.funcIndex]();
#else
#ifdef ENABLE_LL128
Expand Down Expand Up @@ -627,7 +627,7 @@ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDev
// Examples : AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
Expand Down
4 changes: 2 additions & 2 deletions src/collectives/device/onerank_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace {
template<typename T, typename RedOp>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void oneRankReduce() {
#else
__device__ __attribute__((noinline)) void oneRankReduce() {
Expand Down Expand Up @@ -48,7 +48,7 @@ namespace {
}
}

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define INSTANTIATE(devredop, type) \
__device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
oneRankReduce<type, Func##devredop<type>>(); \
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/op128.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ union alignas(16) BytePack<16> {
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#ifndef USE_INDIRECT_FUNCTION_CALL
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/reduce_scatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
Expand Down
2 changes: 1 addition & 1 deletion src/collectives/device/sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
}
}

#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void run(ncclWork *work) {
#else
__device__ __attribute__((noinline)) void run(ncclWork *work) {
Expand Down
2 changes: 1 addition & 1 deletion src/include/collectives.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ struct ncclDevRedOpFull {
nccl##func##algo##proto

/* Declare all collective operations */
#ifdef USE_INDIRECT_FUNCTION_CALL
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
Expand Down
4 changes: 3 additions & 1 deletion src/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int* parentRanks = NULL;
int cudaArch;
int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
hipDeviceProp_t devProp;

CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail);
Expand All @@ -1698,7 +1699,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// Set the maximum kernel stack size of all kernels to avoid
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
if (stackSize > 0 && ncclParamSetStackSize() == 1) {
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (stackSize > 0 && ncclParamSetStackSize() == 1 && strcmp(devProp.gcnArchName,"gfx940") != 0 && strcmp(devProp.gcnArchName, "gfx941") != 0 && strcmp(devProp.gcnArchName, "gfx942") != 0) {
INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
}
Expand Down

0 comments on commit 2f6d59e

Please sign in to comment.