From 8c890ba00bc452af8d66c419a27998297378679e Mon Sep 17 00:00:00 2001 From: archibate <1931127624@qq.com> Date: Fri, 6 Sep 2024 01:32:27 +0800 Subject: [PATCH] addcuda2 --- slides/moderncuda/README.md | 5 +- .../{better_cuda.cuh => cudapp.cuh} | 52 ++++++++++++++----- slides/moderncuda/{ => example}/example.cu | 3 +- slides/moderncuda/{ => example}/kernel.cu | 0 slides/moderncuda/{ => example}/kernel.ptx | 0 slides/moderncuda/main.cu | 33 ++++++++++++ slides/moderncuda/tinybench.cpp | 7 +-- slides/moderncuda/tinybench.hpp | 10 ++++ 8 files changed, 89 insertions(+), 21 deletions(-) rename slides/moderncuda/{better_cuda.cuh => cudapp.cuh} (92%) rename slides/moderncuda/{ => example}/example.cu (91%) rename slides/moderncuda/{ => example}/kernel.cu (100%) rename slides/moderncuda/{ => example}/kernel.ptx (100%) create mode 100644 slides/moderncuda/main.cu diff --git a/slides/moderncuda/README.md b/slides/moderncuda/README.md index 6f033dc..030d3d6 100644 --- a/slides/moderncuda/README.md +++ b/slides/moderncuda/README.md @@ -1,6 +1,9 @@ # 现代 C++ 的 CUDA 编程 -参考资料:https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html +参考资料: + +- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html +- https://www.cs.sfu.ca/~ashriram/Courses/CS431/assets/lectures/Part8/GPU1.pdf ## 配置 CUDA 开发环境 diff --git a/slides/moderncuda/better_cuda.cuh b/slides/moderncuda/cudapp.cuh similarity index 92% rename from slides/moderncuda/better_cuda.cuh rename to slides/moderncuda/cudapp.cuh index 1f3622a..4a234ac 100644 --- a/slides/moderncuda/better_cuda.cuh +++ b/slides/moderncuda/cudapp.cuh @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -11,7 +12,7 @@ #include #include -namespace cupp { +namespace cudapp { std::error_category const &cudaErrorCategory() noexcept { static struct : std::error_category { @@ -40,7 +41,7 @@ void throwCudaError(cudaError_t err, char const *file, int line) { do { \ cudaError_t err = (expr); \ if (err != cudaSuccess) [[unlikely]] { \ - ::cupp::throwCudaError(err, __FILE__, __LINE__); \ + ::cudapp::throwCudaError(err, __FILE__, __LINE__); \ } \ } while (0) @@ -265,10 +266,22 @@ public: } }; - void synchronize() const { + void join() const { CHECK_CUDA(cudaEventSynchronize(*this)); } + bool joinReady() const { + cudaError_t res = cudaEventQuery(*this); + if (res == cudaSuccess) { + return true; + } + if (res == cudaErrorNotReady) { + return false; + } + CHECK_CUDA(res); + return false; + } + float elapsedMillis(CudaEvent const &event) const { float result; CHECK_CUDA(cudaEventElapsedTime(&result, *this, event)); @@ -315,10 +328,6 @@ public: return CudaStream(nullptr); } - void synchronize() const { - CHECK_CUDA(cudaStreamSynchronize(*this)); - } - void copy(void *dst, void *src, size_t size, cudaMemcpyKind kind) const { CHECK_CUDA(cudaMemcpyAsync(dst, src, size, kind, *this)); } @@ -348,23 +357,27 @@ public: CHECK_CUDA(cudaStreamWaitEvent(*this, event, flags)); } - void asyncWait(cudaStreamCallback_t callback, void *userData) const { + void join() const { + CHECK_CUDA(cudaStreamSynchronize(*this)); + } + + void joinAsync(cudaStreamCallback_t callback, void *userData) const { CHECK_CUDA(cudaStreamAddCallback(*this, callback, userData, 0)); } template - void asyncWait(Func &&func) const { + void joinAsync(Func &&func) const { auto userData = std::make_unique(); cudaStreamCallback_t callback = [](cudaStream_t stream, cudaError_t status, void *userData) { std::unique_ptr func(static_cast(userData)); (*func)(stream, status); }; - asyncWait(callback, userData.get()); + joinAsync(callback, userData.get()); userData.release(); } - bool pollWait() { + bool joinReady() const { cudaError_t res = cudaStreamQuery(*this); if (res == cudaSuccess) { return true; @@ -418,7 +431,7 @@ struct CudaAllocator : private Arena { if (res == cudaErrorMemoryAllocation) [[unlikely]] { throw std::bad_alloc(); } - CHECK_CUDA(("Arena::doMalloc", res)); + CHECK_CUDA(res /* Arena::doMalloc */); return static_cast(ptr); } @@ -459,6 +472,21 @@ struct CudaAllocator : private Arena { template using CudaVector = std::vector>; +#if defined(__clang__) && defined(__CUDACC__) && defined(__GLIBCXX__) +__host__ __device__ static void printf(const char *fmt, ...) { + va_list args; + va_start(args, fmt); +#if __CUDA_ARCH__ + ::vprintf(fmt, (const char *)args); +#else + ::vprintf(fmt, args); +#endif + va_end(args); +} +#else +using ::printf; +#endif + // #if __cpp_lib_memory_resource // template // struct CudaResource : std::pmr::memory_resource, private Arena { diff --git a/slides/moderncuda/example.cu b/slides/moderncuda/example/example.cu similarity index 91% rename from slides/moderncuda/example.cu rename to slides/moderncuda/example/example.cu index 07fc336..84216d0 100644 --- a/slides/moderncuda/example.cu +++ b/slides/moderncuda/example/example.cu @@ -5,8 +5,7 @@ } __device__ void device_func() { - auto t = cooperative_groups::this_thread(); - t.size(); + std::sin(1); } __host__ __device__ void host_device_func() { diff --git a/slides/moderncuda/kernel.cu b/slides/moderncuda/example/kernel.cu similarity index 100% rename from slides/moderncuda/kernel.cu rename to slides/moderncuda/example/kernel.cu diff --git a/slides/moderncuda/kernel.ptx b/slides/moderncuda/example/kernel.ptx similarity index 100% rename from slides/moderncuda/kernel.ptx rename to slides/moderncuda/example/kernel.ptx diff --git a/slides/moderncuda/main.cu b/slides/moderncuda/main.cu new file mode 100644 index 0000000..e8ce85c --- /dev/null +++ b/slides/moderncuda/main.cu @@ -0,0 +1,33 @@ +#include +#include +#include "cudapp.cuh" + +using namespace cudapp; + +extern "C" __global__ void kernel(int x) { + printf("内核参数 x = %d\n", x); + printf("线程编号 (%d, %d)\n", blockIdx.x, threadIdx.x); +} + +int main() { + int x = 42; + kernel<<<3, 4, 0, 0>>>(x); + + void *args[] = {&x}; + CHECK_CUDA(cudaLaunchKernel((const void *)kernel, dim3(3), dim3(4), args, 0, 0)); + + cudaLaunchConfig_t cfg{}; + cfg.blockDim = dim3(3); + cfg.gridDim = dim3(4); + cfg.dynamicSmemBytes = 0; + cfg.stream = 0; + cfg.attrs = nullptr; + cfg.numAttrs = 0; + CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x)); + + const char *name; + CHECK_CUDA(cudaFuncGetName(&name, (const void *)kernel)); + + CudaStream::nullStream().join(); + return 0; +} diff --git a/slides/moderncuda/tinybench.cpp b/slides/moderncuda/tinybench.cpp index 85fb797..39c6c72 100644 --- a/slides/moderncuda/tinybench.cpp +++ b/slides/moderncuda/tinybench.cpp @@ -1,8 +1,3 @@ +#define TINYBENCH_IMPL_MAIN #define TINYBENCH_IMPL #include "tinybench.hpp" - -[[gnu::weak]] int main() { - std::unique_ptr rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()})); - rep->run_all(); - return 0; -} diff --git a/slides/moderncuda/tinybench.hpp b/slides/moderncuda/tinybench.hpp index f657488..a3df1fa 100644 --- a/slides/moderncuda/tinybench.hpp +++ b/slides/moderncuda/tinybench.hpp @@ -885,3 +885,13 @@ Reporter *makeMultipleReporter(std::vector const &reporters) { } #endif + +#ifdef TINYBENCH_IMPL_MAIN +#include + +[[gnu::weak]] int main() { + std::unique_ptr rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()})); + rep->run_all(); + return 0; +} +#endif