From 8c890ba00bc452af8d66c419a27998297378679e Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 6 Sep 2024 01:32:27 +0800
Subject: [PATCH] addcuda2

---
 slides/moderncuda/README.md                   |  5 +-
 .../{better_cuda.cuh => cudapp.cuh}           | 52 ++++++++++++++-----
 slides/moderncuda/{ => example}/example.cu    |  3 +-
 slides/moderncuda/{ => example}/kernel.cu     |  0
 slides/moderncuda/{ => example}/kernel.ptx    |  0
 slides/moderncuda/main.cu                     | 33 ++++++++++++
 slides/moderncuda/tinybench.cpp               |  7 +--
 slides/moderncuda/tinybench.hpp               | 10 ++++
 8 files changed, 89 insertions(+), 21 deletions(-)
 rename slides/moderncuda/{better_cuda.cuh => cudapp.cuh} (92%)
 rename slides/moderncuda/{ => example}/example.cu (91%)
 rename slides/moderncuda/{ => example}/kernel.cu (100%)
 rename slides/moderncuda/{ => example}/kernel.ptx (100%)
 create mode 100644 slides/moderncuda/main.cu
diff --git a/slides/moderncuda/README.md b/slides/moderncuda/README.md
index 6f033dc..030d3d6 100644
--- a/slides/moderncuda/README.md
+++ b/slides/moderncuda/README.md
@@ -1,6 +1,9 @@
 # 现代 C++ 的 CUDA 编程
 
-参考资料：https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+参考资料：
+
+- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
+- https://www.cs.sfu.ca/~ashriram/Courses/CS431/assets/lectures/Part8/GPU1.pdf
 
 ## 配置 CUDA 开发环境
 
diff --git a/slides/moderncuda/better_cuda.cuh b/slides/moderncuda/cudapp.cuh
similarity index 92%
rename from slides/moderncuda/better_cuda.cuh
rename to slides/moderncuda/cudapp.cuh
index 1f3622a..4a234ac 100644
--- a/slides/moderncuda/better_cuda.cuh
+++ b/slides/moderncuda/cudapp.cuh
@@ -3,6 +3,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <cstdlib>
+#include <cstdarg>
 #include <cuda_runtime.h>
 #include <memory>
 #include <new>
@@ -11,7 +12,7 @@
 #include <utility>
 #include <vector>
 
-namespace cupp {
+namespace cudapp {
 
 std::error_category const &cudaErrorCategory() noexcept {
     static struct : std::error_category {
@@ -40,7 +41,7 @@ void throwCudaError(cudaError_t err, char const *file, int line) {
     do { \
         cudaError_t err = (expr); \
         if (err != cudaSuccess) [[unlikely]] { \
-            ::cupp::throwCudaError(err, __FILE__, __LINE__); \
+            ::cudapp::throwCudaError(err, __FILE__, __LINE__); \
         } \
     } while (0)
 
@@ -265,10 +266,22 @@ public:
         }
     };
 
-    void synchronize() const {
+    void join() const {
         CHECK_CUDA(cudaEventSynchronize(*this));
     }
 
+    bool joinReady() const {
+        cudaError_t res = cudaEventQuery(*this);
+        if (res == cudaSuccess) {
+            return true;
+        }
+        if (res == cudaErrorNotReady) {
+            return false;
+        }
+        CHECK_CUDA(res);
+        return false;
+    }
+
     float elapsedMillis(CudaEvent const &event) const {
         float result;
         CHECK_CUDA(cudaEventElapsedTime(&result, *this, event));
@@ -315,10 +328,6 @@ public:
         return CudaStream(nullptr);
     }
 
-    void synchronize() const {
-        CHECK_CUDA(cudaStreamSynchronize(*this));
-    }
-
     void copy(void *dst, void *src, size_t size, cudaMemcpyKind kind) const {
         CHECK_CUDA(cudaMemcpyAsync(dst, src, size, kind, *this));
     }
@@ -348,23 +357,27 @@ public:
         CHECK_CUDA(cudaStreamWaitEvent(*this, event, flags));
     }
 
-    void asyncWait(cudaStreamCallback_t callback, void *userData) const {
+    void join() const {
+        CHECK_CUDA(cudaStreamSynchronize(*this));
+    }
+
+    void joinAsync(cudaStreamCallback_t callback, void *userData) const {
         CHECK_CUDA(cudaStreamAddCallback(*this, callback, userData, 0));
     }
 
     template <class Func>
-    void asyncWait(Func &&func) const {
+    void joinAsync(Func &&func) const {
         auto userData = std::make_unique<Func>();
         cudaStreamCallback_t callback = [](cudaStream_t stream,
                                            cudaError_t status, void *userData) {
             std::unique_ptr<Func> func(static_cast<Func *>(userData));
             (*func)(stream, status);
         };
-        asyncWait(callback, userData.get());
+        joinAsync(callback, userData.get());
         userData.release();
     }
 
-    bool pollWait() {
+    bool joinReady() const {
         cudaError_t res = cudaStreamQuery(*this);
         if (res == cudaSuccess) {
             return true;
@@ -418,7 +431,7 @@ struct CudaAllocator : private Arena {
         if (res == cudaErrorMemoryAllocation) [[unlikely]] {
             throw std::bad_alloc();
         }
-        CHECK_CUDA(("Arena::doMalloc", res));
+        CHECK_CUDA(res /* Arena::doMalloc */);
         return static_cast<T *>(ptr);
     }
 
@@ -459,6 +472,21 @@ struct CudaAllocator : private Arena {
 template <class T>
 using CudaVector = std::vector<T, CudaAllocator<T>>;
 
+#if defined(__clang__) && defined(__CUDACC__) && defined(__GLIBCXX__)
+__host__ __device__ static void printf(const char *fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+#if __CUDA_ARCH__
+    ::vprintf(fmt, (const char *)args);
+#else
+    ::vprintf(fmt, args);
+#endif
+    va_end(args);
+}
+#else
+using ::printf;
+#endif
+
 // #if __cpp_lib_memory_resource
 // template <class Arena>
 // struct CudaResource : std::pmr::memory_resource, private Arena {
diff --git a/slides/moderncuda/example.cu b/slides/moderncuda/example/example.cu
similarity index 91%
rename from slides/moderncuda/example.cu
rename to slides/moderncuda/example/example.cu
index 07fc336..84216d0 100644
--- a/slides/moderncuda/example.cu
+++ b/slides/moderncuda/example/example.cu
@@ -5,8 +5,7 @@
 }
 
 __device__ void device_func() {
-    auto t = cooperative_groups::this_thread();
-    t.size();
+    std::sin(1);
 }
 
 __host__ __device__ void host_device_func() {
diff --git a/slides/moderncuda/kernel.cu b/slides/moderncuda/example/kernel.cu
similarity index 100%
rename from slides/moderncuda/kernel.cu
rename to slides/moderncuda/example/kernel.cu
diff --git a/slides/moderncuda/kernel.ptx b/slides/moderncuda/example/kernel.ptx
similarity index 100%
rename from slides/moderncuda/kernel.ptx
rename to slides/moderncuda/example/kernel.ptx
diff --git a/slides/moderncuda/main.cu b/slides/moderncuda/main.cu
new file mode 100644
index 0000000..e8ce85c
--- /dev/null
+++ b/slides/moderncuda/main.cu
@@ -0,0 +1,33 @@
+#include <cuda_runtime.h>
+#include <nvfunctional>
+#include "cudapp.cuh"
+
+using namespace cudapp;
+
+extern "C" __global__ void kernel(int x) {
+    printf("内核参数 x = %d\n", x);
+    printf("线程编号 (%d, %d)\n", blockIdx.x, threadIdx.x);
+}
+
+int main() {
+    int x = 42;
+    kernel<<<3, 4, 0, 0>>>(x);
+
+    void *args[] = {&x};
+    CHECK_CUDA(cudaLaunchKernel((const void *)kernel, dim3(3), dim3(4), args, 0, 0));
+
+    cudaLaunchConfig_t cfg{};
+    cfg.blockDim = dim3(3);
+    cfg.gridDim = dim3(4);
+    cfg.dynamicSmemBytes = 0;
+    cfg.stream = 0;
+    cfg.attrs = nullptr;
+    cfg.numAttrs = 0;
+    CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x));
+
+    const char *name;
+    CHECK_CUDA(cudaFuncGetName(&name, (const void *)kernel));
+
+    CudaStream::nullStream().join();
+    return 0;
+}
diff --git a/slides/moderncuda/tinybench.cpp b/slides/moderncuda/tinybench.cpp
index 85fb797..39c6c72 100644
--- a/slides/moderncuda/tinybench.cpp
+++ b/slides/moderncuda/tinybench.cpp
@@ -1,8 +1,3 @@
+#define TINYBENCH_IMPL_MAIN
 #define TINYBENCH_IMPL
 #include "tinybench.hpp"
-
-[[gnu::weak]] int main() {
-    std::unique_ptr<tinybench::Reporter> rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
-    rep->run_all();
-    return 0;
-}
diff --git a/slides/moderncuda/tinybench.hpp b/slides/moderncuda/tinybench.hpp
index f657488..a3df1fa 100644
--- a/slides/moderncuda/tinybench.hpp
+++ b/slides/moderncuda/tinybench.hpp
@@ -885,3 +885,13 @@ Reporter *makeMultipleReporter(std::vector<Reporter *> const &reporters) {
 
 }
 #endif
+
+#ifdef TINYBENCH_IMPL_MAIN
+#include <memory>
+
+[[gnu::weak]] int main() {
+    std::unique_ptr<tinybench::Reporter> rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
+    rep->run_all();
+    return 0;
+}
+#endif