[NFC] Cleanup code (#50)
Run clang-format on Kitsune-specific files. Remove trailing whitespace
and excess newlines from elsewhere.
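For reference, a cleanup of this kind can typically be reproduced with an in-place clang-format run plus a trailing-whitespace sweep; a minimal sketch (the exact invocation and file list are assumed here, not recorded in the commit):

    # Reformat the Kitsune-specific C++ sources in place (assumed file list).
    clang-format -i llvm/lib/Transforms/Tapir/CudaABI.cpp \
                    llvm/lib/Transforms/Tapir/HipABI.cpp
    # Strip trailing whitespace from the remaining touched files.
    sed -i 's/[ \t]*$//' clang/lib/Driver/ToolChain.cpp \
                         kitsune/CMakeLists.txt kitsune/runtime/CMakeLists.txt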

Co-authored-by: Tarun Prabhu <tarun.prabhu@gmail.com>
tarunprabhu and Tarun Prabhu authored Aug 29, 2024
1 parent 49ec8ad commit 0e6fd93
Showing 5 changed files with 65 additions and 70 deletions.
clang/lib/Driver/ToolChain.cpp (2 changes: 1 addition & 1 deletion)
@@ -1716,7 +1716,7 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args,
   case llvm::TapirTargetID::Hip:
     CmdArgs.push_back(
         Args.MakeArgString(StringRef("-L") + KITSUNE_HIP_LIBRARY_DIR));
-    ExtractArgsFromString("-lamdhip64", CmdArgs, Args);
+    ExtractArgsFromString("-lamdhip64", CmdArgs, Args);
     ExtractArgsFromString(KITSUNE_HIP_EXTRA_LINKER_FLAGS, CmdArgs, Args);
     break;

kitsune/CMakeLists.txt (3 changes: 1 addition & 2 deletions)
@@ -274,7 +274,7 @@ if (KITSUNE_HIP_ENABLE)
   if (DEFINED $ENV{ROCM_PATH})
     message(STATUS "  using ROCM_PATH environment variable.")
     set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.")
-  else()
+  else()
     message(STATUS "  selecting a common default install location.")
     set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.")
   endif()
@@ -351,7 +351,6 @@ if (KITSUNE_OPENCILK_ENABLE)
     CACHE STRING
     "Additional C++ compiler flags needed to build Kokkos. These are only used when building Kitsune")

-
 # We pass some LLVM_* variables to make Cheetah behave as if it were an
 # in-tree build.
 set(KITSUNE_CHEETAH_SOURCE_DIR ${KITSUNE_TARGETS_BINARY_DIR}/cheetah/cheetah)
kitsune/runtime/CMakeLists.txt (13 changes: 6 additions & 7 deletions)
@@ -72,7 +72,6 @@ target_include_directories(${KITRT}
 # set_property(TARGET ${KITRT} APPEND PROPERTY
 #   BUILD_RPATH "$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}:${PROJECT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}")

-
 find_library(LIB_LLVM
   NAMES LLVM
         LLVM-${LLVM_VERSION_MAJOR}
@@ -140,21 +139,21 @@ endif()
 # kitsune/CMakeLists.txt). They are passed in via CMake because the Kitsune
 # runtime is treated as an "ExternalProject".
 #
-# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP...
+# Note on AMD HIP CMake details... When is HIP not HIP? When it is HIP...
 # It is easy to confuse requirements for HIP as the HIP "language" vs. the
 # HIP runtime. The language has tangled dependencies within a LLVM context
-# as they build around LLVM and Clang too. Using some of the 'hip::host'
+# as they build around LLVM and Clang too. Using some of the 'hip::host'
 # targets are actually intended for the language and not runtime libraries.
-# Unfortunately, these details seem to be a moving target with ecah release
-# of rocm/hip...
+# Unfortunately, these details seem to be a moving target with ecah release
+# of rocm/hip...
 if (KITSUNE_HIP_ENABLE)

   if (NOT DEFINED ROCM_PATH)
     message(STATUS "Kitsune HIP support enabled but ROCM_PATH not set.")
     if (DEFINED $ENV{ROCM_PATH})
       message(STATUS "  using ROCM_PATH environment variable.")
       set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "AMD ROCM/HIP installation directory.")
-    else()
+    else()
       message(STATUS "  selecting a common default install location.")
       set(ROCM_PATH "/opt/rocm" CACHE PATH "AMD ROCM/HIP installation directory.")
     endif()
@@ -177,7 +176,7 @@ if (KITSUNE_HIP_ENABLE)

   message(STATUS "hip include dir: ${hip_INCLUDE_DIR}")
   message(STATUS "hip library dir: ${hip_LIB_INSTALL_DIR}")
-
+
   target_compile_definitions(${KITRT} PUBLIC KITRT_HIP_ENABLED)
   target_compile_definitions(${KITRT} PUBLIC __HIP_PLATFORM_AMD__)
   target_include_directories(${KITRT} PUBLIC ${hip_INCLUDE_DIR})
llvm/lib/Transforms/Tapir/CudaABI.cpp (22 changes: 11 additions & 11 deletions)
@@ -414,7 +414,7 @@ CudaLoop::CudaLoop(Module &M, Module &KernelModule, const std::string &KN,
                             Int32Ty, // threads-per-block
                             KernelInstMixTy->getPointerTo(), // instruction mix info
                             VoidPtrTy); // opaque cuda stream
-
+
   KitCudaMemPrefetchFn =
       M.getOrInsertFunction("__kitcuda_mem_gpu_prefetch",
                             VoidPtrTy, // return an opaque stream
@@ -986,14 +986,14 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   // supported) we can end up with multiple calls to the outlined
   // loop (which has been setup for dead code elimination) but can
   // cause invalid IR that trips us up when handling the GPU module
-  // code generation. This is a challenge in the Tapir design that
-  // was not geared to handle some of the nuances of GPU target
-  // transformations (and code gen). To address this, we need to
+  // code generation. This is a challenge in the Tapir design that
+  // was not geared to handle some of the nuances of GPU target
+  // transformations (and code gen). To address this, we need to
   // do some clean up to keep the IR correct (or the verifier will
-  // fail on us...). Specifically, we can no longer depend upon
+  // fail on us...). Specifically, we can no longer depend upon
   // DCE as it runs too late in the GPU transformation process...
   //
-  // TODO: This code can be shared between the cuda and hip targets...
+  // TODO: This code can be shared between the cuda and hip targets...
   //
   Function *TargetKF = KernelModule.getFunction(KernelName);
   std::list<Instruction *> RemoveList;
@@ -1029,7 +1029,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   IRBuilder<> NewBuilder(&NewBB->front());

   // TODO: There is some potential here to share this code across both
-  // the hip and cuda transforms...
+  // the hip and cuda transforms...
   LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size()
                     << " kernel args.\n");
   PointerType *VoidPtrTy = PointerType::getUnqual(Ctx);
@@ -1049,8 +1049,8 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
     i++;

     if (CodeGenPrefetch && V->getType()->isPointerTy()) {
-      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #"
-                 << i << "\n");
+      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i
+                        << "\n");
       Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy);
       Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, CudaStream);
       Value *NewSPtr =
@@ -2005,15 +2005,15 @@ CudaABIOutputFile CudaABI::generatePTX() {
   FunctionAnalysisManager fam;
   CGSCCAnalysisManager cgam;
   ModuleAnalysisManager mam;
-
+
   PassBuilder pb(PTXTargetMachine, pto);
   pb.registerModuleAnalyses(mam);
   pb.registerCGSCCAnalyses(cgam);
   pb.registerFunctionAnalyses(fam);
   pb.registerLoopAnalyses(lam);
   PTXTargetMachine->registerPassBuilderCallbacks(pb, false);
   pb.crossRegisterProxies(lam, fam, cgam, mam);
-
+
   ModulePassManager mpm = pb.buildPerModuleDefaultPipeline(optLevel);
   mpm.addPass(VerifierPass());
   LLVM_DEBUG(dbgs() << "\t\t* module: " << KernelModule.getName() << "\n");
llvm/lib/Transforms/Tapir/HipABI.cpp (95 changes: 46 additions & 49 deletions)
@@ -113,11 +113,10 @@
 #include "llvm/Transforms/Tapir/Outline.h"
 #include "llvm/Transforms/Tapir/TapirGPUUtils.h"
 #include "llvm/Transforms/Tapir/TapirLoopInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/TapirUtils.h"
 #include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/Transforms/Utils/TapirUtils.h"

 using namespace llvm;

@@ -249,8 +248,7 @@ cl::opt<ROCmABIVersion> ROCmABITarget(
     "hipabi-rocm-abi", cl::init(ROCm_ABI_V4), cl::Hidden,
     cl::desc("Select the targeted ROCm ABI version."),
     cl::values(clEnumValN(ROCm_ABI_V4, "v4", "Target ROCm version 4 ABI."),
-               clEnumValN(ROCm_ABI_V5, "v5",
-                          "Target ROCm v. 5 ABI.")));
+               clEnumValN(ROCm_ABI_V5, "v5", "Target ROCm v. 5 ABI.")));

 cl::opt<bool> Use64ElementWavefront(
     "hipabi-wavefront64", cl::init(true), cl::Hidden,
@@ -623,15 +621,17 @@ void HipABI::transformConstants(Function *Fn) {
       LLVM_DEBUG(dbgs() << "\t\t\t\tnew store: " << *SI << "\n");
     } else if (auto *Call = dyn_cast<CallBase>(U->getUser())) {
       unsigned argNo = Call->getArgOperandNo(U);
-      LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call << "\n");
+      LLVM_DEBUG(dbgs() << "\t\tpatching callable instruction: " << *Call
+                        << "\n");
       // FIXME: This is not correct! The function operand should be
       // checked to see what address space it expects.
       Instruction *asCast =
           new AddrSpaceCastInst(NewGEP, OldGEP->getType(), "", Call);
       Call->setArgOperand(argNo, asCast);
       LLVM_DEBUG(dbgs() << "\t\t\t\tnew call: " << *Call << "\n");
     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(U->getUser())) {
-      LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP << "\n");
+      LLVM_DEBUG(dbgs() << "\t\tpatching gep instruction:\n\t\t\t" << *GEP
+                        << "\n");
       Instruction *asCast =
           new AddrSpaceCastInst(NewGEP, OldGEP->getType(), "", GEP);
       GEP->setOperand(GEP->getPointerOperandIndex(), asCast);
@@ -830,24 +830,25 @@ HipLoop::HipLoop(Module &M, Module &KModule, const std::string &Name,
                             Int64Ty,  // number of floating point ops.
                             Int64Ty); // number of integer ops.

-  KitHipLaunchFn = M.getOrInsertFunction("__kithip_launch_kernel",
-                                         VoidPtrTy, // return an opaque stream
-                                         VoidPtrTy, // fat-binary
-                                         VoidPtrTy, // kernel name
-                                         VoidPtrTy, // arguments
-                                         Int64Ty,   // trip count
-                                         Int32Ty,   // threads-per-block
+  KitHipLaunchFn = M.getOrInsertFunction(
+      "__kithip_launch_kernel",
+      VoidPtrTy, // return an opaque stream
+      VoidPtrTy, // fat-binary
+      VoidPtrTy, // kernel name
+      VoidPtrTy, // arguments
+      Int64Ty,   // trip count
+      Int32Ty,   // threads-per-block
       KernelInstMixTy->getPointerTo(), // instruction mix info
-                                         VoidPtrTy); // opaque cuda stream
+      VoidPtrTy); // opaque cuda stream

-  KitHipMemPrefetchFn = M.getOrInsertFunction("__kithip_mem_gpu_prefetch",
-                                              VoidPtrTy,  // return an opaque stream
-                                              VoidPtrTy,  // pointer to prefetch
-                                              VoidPtrTy); // use opaque stream.
+  KitHipMemPrefetchFn =
+      M.getOrInsertFunction("__kithip_mem_gpu_prefetch",
+                            VoidPtrTy,  // return an opaque stream
+                            VoidPtrTy,  // pointer to prefetch
+                            VoidPtrTy); // use opaque stream.
 }

-HipLoop::~HipLoop() { /* no-op */
-}
+HipLoop::~HipLoop() { /* no-op */ }

 // TODO: Can we also transform the arguments into a different address space here
 // and avoid our use of 'mutate' elsewhere in the code?
@@ -1315,14 +1316,14 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   // supported) we can end up with multiple calls to the outlined
   // loop (which has been setup for dead code elimination) but can
   // cause invalid IR that trips us up when handling the GPU module
-  // code generation. This is a challenge in the Tapir design that
-  // was not geared to handle some of the nuances of GPU target
-  // transformations (and code gen). To address this, we need to
+  // code generation. This is a challenge in the Tapir design that
+  // was not geared to handle some of the nuances of GPU target
+  // transformations (and code gen). To address this, we need to
   // do some clean up to keep the IR correct (or the verifier will
-  // fail on us...). Specifically, we can no longer depend upon
+  // fail on us...). Specifically, we can no longer depend upon
   // DCE as it runs too late in the GPU transformation process...
   //
-  // TODO: This code can be shared between the cuda and hip targets...
+  // TODO: This code can be shared between the cuda and hip targets...
   //
   Function *TargetKF = KernelModule.getFunction(KernelName);
   std::list<Instruction *> RemoveList;
@@ -1357,7 +1358,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   IRBuilder<> NewBuilder(&NewBB->front());

   // TODO: There is some potential here to share this code across both
-  // the hip and cuda transforms...
+  // the hip and cuda transforms...
   LLVM_DEBUG(dbgs() << "\t*- code gen packing of " << OrderedInputs.size()
                     << " kernel args.\n");
   PointerType *VoidPtrTy = PointerType::getUnqual(Ctx);
@@ -1377,12 +1378,12 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
     i++;

     if (CodeGenPrefetch && V->getType()->isPointerTy()) {
-      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #"
-                 << i << "\n");
+      LLVM_DEBUG(dbgs() << "\t\t- code gen prefetch for kernel arg #" << i
+                        << "\n");
       Value *VoidPP = NewBuilder.CreateBitCast(V, VoidPtrTy);
       Value *SPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream);
-      Value *NewSPtr =
-        NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr});
+      Value *NewSPtr =
+          NewBuilder.CreateCall(KitHipMemPrefetchFn, {VoidPP, SPtr});
       if (not StreamAssigned) {
         NewBuilder.CreateStore(NewSPtr, HipStream);
         StreamAssigned = true;
@@ -1463,13 +1464,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                            ConstantInt::get(Int64Ty, InstMix.num_iops));

     AllocaInst *AI = NewBuilder.CreateAlloca(KernelInstMixTy);
-    NewBuilder.CreateStore(InstructionMix, AI);
+    NewBuilder.CreateStore(InstructionMix, AI);

     LLVM_DEBUG(dbgs() << "\t*- code gen kernel launch...\n");
     Value *KSPtr = NewBuilder.CreateLoad(VoidPtrTy, HipStream);
-    CallInst *LaunchStream = NewBuilder.CreateCall(KitHipLaunchFn,
-        {DummyFBPtr, KNameParam, argsPtr, CastTripCount,
-         TPBlockValue, AI, KSPtr});
+    CallInst *LaunchStream = NewBuilder.CreateCall(
+        KitHipLaunchFn, {DummyFBPtr, KNameParam, argsPtr, CastTripCount,
+                         TPBlockValue, AI, KSPtr});
     NewBuilder.CreateStore(LaunchStream, HipStream);
     LLVM_DEBUG(dbgs() << "\t\t+- registering launch stream:\n"
                       << "\t\t\tcall: " << *LaunchStream << "\n"
@@ -1514,8 +1515,7 @@ HipABI::HipABI(Module &InputModule)
                            Int32Ty,  // host pointer
                            Int64Ty,  // device pointer
                            Int64Ty); // number of bytes to copy
-  KitHipSyncFn = M.getOrInsertFunction("__kithip_sync_thread_stream",
-                          VoidTy,
+  KitHipSyncFn = M.getOrInsertFunction("__kithip_sync_thread_stream", VoidTy,
                           VoidPtrTy); // no return, nor parameters
   // Build the details we need for the AMDGPU/HIP target.
   std::string ArchString = "amdgcn";
@@ -1575,8 +1575,7 @@ HipABI::HipABI(Module &InputModule)
   ROCmModulesLoaded = false;
 }

-HipABI::~HipABI() { /* no-op */
-}
+HipABI::~HipABI() { /* no-op */ }

 std::unique_ptr<Module> &HipABI::getLibDeviceModule() {

@@ -1704,7 +1703,7 @@ void HipABI::postProcessFunction(Function &F, bool OutliningTapirLoops) {
   for (Value *SR : SyncRegList) {
     for (Use &U : SR->uses()) {
       if (auto *SyncI = dyn_cast<SyncInst>(U.getUser()))
-        CallInst::Create(KitHipSyncFn, {HipStream}, "",
+        CallInst::Create(KitHipSyncFn, {HipStream}, "",
                          &*SyncI->getSuccessor(0)->begin());
     }
   }
@@ -1740,7 +1739,7 @@ void HipABI::finalizeLaunchCalls(Module &M, GlobalVariable *BundleBin) {
       AllocaInst *StreamAI = getLaunchStream(SavedLaunchCI);
       assert(StreamAI != nullptr && "unexpected null launch stream!");
       IRBuilder<> SyncBuilder(CI);
-      Value *HipStream =
+      Value *HipStream =
           SyncBuilder.CreateLoad(VoidPtrTy, StreamAI, "hipstreamh");
       CI->setArgOperand(0, HipStream);
       SavedLaunchCI = nullptr;
@@ -1815,7 +1814,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
   ObjFile->keep();

   if (OptLevel > 0) {
-    if (OptLevel > 3)
+    if (OptLevel > 3)
       OptLevel = 3;
     LLVM_DEBUG(dbgs() << "\t- running kernel module optimization passes...\n");
     PipelineTuningOptions pto;
@@ -1831,7 +1830,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {
         OptimizationLevel::O2,
         OptimizationLevel::O3,
     };
-    OptimizationLevel optLevel = optLevels[OptLevel];
+    OptimizationLevel optLevel = optLevels[OptLevel];

     // From the LLVM docs: Create the analysis managers.
     // These must be declared in this order so that they are destroyed in the
@@ -1860,8 +1859,7 @@ HipABIOutputFile HipABI::createTargetObj(const StringRef &ObjFileName) {

     legacy::PassManager PassMgr;
     if (AMDTargetMachine->addPassesToEmitFile(PassMgr, ObjFile->os(), nullptr,
-                                              CodeGenFileType::ObjectFile,
-                                              false))
+                                              CodeGenFileType::ObjectFile, false))
       report_fatal_error("hipabi: AMDGPU target failed!");

     PassMgr.run(KernelModule);
@@ -1998,11 +1996,10 @@ GlobalVariable *HipABI::embedBundle(HipABIOutputFile &BundleFile) {
       StringRef(Bundle->getBufferStart(), Bundle->getBufferSize()),
       Bundle->getBufferSize(), Int8Ty);
   GlobalVariable *BundleGV;
-  BundleGV = new GlobalVariable(M, BundleArray->getType(), true,
-                                GlobalValue::PrivateLinkage, BundleArray,
-                                "__hip_fatbin",
-                                nullptr, GlobalVariable::NotThreadLocal);
-
+  BundleGV = new GlobalVariable(
+      M, BundleArray->getType(), true, GlobalValue::PrivateLinkage, BundleArray,
+      "__hip_fatbin", nullptr, GlobalVariable::NotThreadLocal);

   const char *BundleSectionName = ".hip_fatbin";
   BundleGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
   BundleGV->setSection(BundleSectionName);
