Skip to content

Commit

Permalink
addcuda2
Browse files Browse the repository at this point in the history
  • Loading branch information
archibate committed Sep 5, 2024
1 parent ebaa3de commit 8c890ba
Show file tree
Hide file tree
Showing 8 changed files with 89 additions and 21 deletions.
5 changes: 4 additions & 1 deletion slides/moderncuda/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# 现代 C++ 的 CUDA 编程

参考资料:https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
参考资料:

- https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
- https://www.cs.sfu.ca/~ashriram/Courses/CS431/assets/lectures/Part8/GPU1.pdf

## 配置 CUDA 开发环境

Expand Down
52 changes: 40 additions & 12 deletions slides/moderncuda/better_cuda.cuh → slides/moderncuda/cudapp.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstdarg>
#include <cuda_runtime.h>
#include <memory>
#include <new>
Expand All @@ -11,7 +12,7 @@
#include <utility>
#include <vector>

namespace cupp {
namespace cudapp {

std::error_category const &cudaErrorCategory() noexcept {
static struct : std::error_category {
Expand Down Expand Up @@ -40,7 +41,7 @@ void throwCudaError(cudaError_t err, char const *file, int line) {
do { \
cudaError_t err = (expr); \
if (err != cudaSuccess) [[unlikely]] { \
::cupp::throwCudaError(err, __FILE__, __LINE__); \
::cudapp::throwCudaError(err, __FILE__, __LINE__); \
} \
} while (0)

Expand Down Expand Up @@ -265,10 +266,22 @@ public:
}
};

void synchronize() const {
void join() const {
CHECK_CUDA(cudaEventSynchronize(*this));
}

// Non-blocking poll of this event's status.
// Returns true once the event has been recorded and reached, false while
// work before it is still pending; any other query result is raised as a
// CUDA error via CHECK_CUDA.
bool joinReady() const {
    cudaError_t status = cudaEventQuery(*this);
    switch (status) {
    case cudaSuccess:
        return true;
    case cudaErrorNotReady:
        return false;
    default:
        CHECK_CUDA(status);  // throws; the return below is unreachable
        return false;
    }
}

float elapsedMillis(CudaEvent const &event) const {
float result;
CHECK_CUDA(cudaEventElapsedTime(&result, *this, event));
Expand Down Expand Up @@ -315,10 +328,6 @@ public:
return CudaStream(nullptr);
}

void synchronize() const {
CHECK_CUDA(cudaStreamSynchronize(*this));
}

// Asynchronously copies `size` bytes from `src` to `dst` on this stream.
// `kind` selects the direction (host-to-device, device-to-host, ...).
// `src` is taken as const: cudaMemcpyAsync never writes through it, and
// the const qualifier lets callers pass read-only buffers. Existing
// callers passing `void *` still compile (non-const converts to const).
void copy(void *dst, const void *src, size_t size, cudaMemcpyKind kind) const {
    CHECK_CUDA(cudaMemcpyAsync(dst, src, size, kind, *this));
}
Expand Down Expand Up @@ -348,23 +357,27 @@ public:
CHECK_CUDA(cudaStreamWaitEvent(*this, event, flags));
}

void asyncWait(cudaStreamCallback_t callback, void *userData) const {
// Blocks the calling host thread until all work previously queued on
// this stream has completed (wraps cudaStreamSynchronize).
void join() const {
CHECK_CUDA(cudaStreamSynchronize(*this));
}

// Registers a host callback that runs once all work previously queued
// on this stream completes (wraps cudaStreamAddCallback; flags must be 0).
// NOTE(review): cudaStreamAddCallback is deprecated in recent CUDA in
// favor of cudaLaunchHostFunc — confirm the minimum toolkit targeted.
void joinAsync(cudaStreamCallback_t callback, void *userData) const {
CHECK_CUDA(cudaStreamAddCallback(*this, callback, userData, 0));
}

// Registers an arbitrary callable to run on the host once all work
// previously queued on this stream completes. The callable is invoked
// as func(stream, status).
//
// Bug fix: the previous version called std::make_unique<Func>() with no
// arguments, default-constructing the callable and discarding the one the
// caller passed in (including any captured state); lambdas with captures
// would not even compile. The callable is now moved/copied onto the heap
// so it outlives this scope; ownership transfers to the trampoline, which
// deletes it after invocation. std::decay_t strips the reference that
// forwarding-reference deduction can add to Func.
template <class Func>
void joinAsync(Func &&func) const {
    using Fn = std::decay_t<Func>;
    auto userData = std::make_unique<Fn>(std::forward<Func>(func));
    cudaStreamCallback_t callback = [](cudaStream_t stream,
                                       cudaError_t status, void *data) {
        // Reclaim ownership; unique_ptr frees the callable after the call.
        std::unique_ptr<Fn> fn(static_cast<Fn *>(data));
        (*fn)(stream, status);
    };
    joinAsync(callback, userData.get());
    userData.release();  // trampoline now owns the callable
}

bool pollWait() {
bool joinReady() const {
cudaError_t res = cudaStreamQuery(*this);
if (res == cudaSuccess) {
return true;
Expand Down Expand Up @@ -418,7 +431,7 @@ struct CudaAllocator : private Arena {
if (res == cudaErrorMemoryAllocation) [[unlikely]] {
throw std::bad_alloc();
}
CHECK_CUDA(("Arena::doMalloc", res));
CHECK_CUDA(res /* Arena::doMalloc */);
return static_cast<T *>(ptr);
}

Expand Down Expand Up @@ -459,6 +472,21 @@ struct CudaAllocator : private Arena {
template <class T>
using CudaVector = std::vector<T, CudaAllocator<T>>;

// Workaround for clang-as-CUDA-compiler with libstdc++: provide a
// __host__ __device__ printf shim so the same call compiles in both
// host and device code under this toolchain combination.
#if defined(__clang__) && defined(__CUDACC__) && defined(__GLIBCXX__)
__host__ __device__ static void printf(const char *fmt, ...) {
va_list args;
va_start(args, fmt);
#if __CUDA_ARCH__
// Device path: forwards the packed varargs buffer to device-side vprintf.
// NOTE(review): the (const char *) cast of a va_list assumes a
// pointer/array-like va_list representation on this ABI — verify this
// against the device vprintf signature for the targeted toolchain.
::vprintf(fmt, (const char *)args);
#else
// Host path: plain C vprintf with the usual va_list.
::vprintf(fmt, args);
#endif
va_end(args);
}
#else
// Other toolchains: the ordinary ::printf already works in both host and
// device code; just pull it into this namespace.
using ::printf;
#endif

// #if __cpp_lib_memory_resource
// template <class Arena>
// struct CudaResource : std::pmr::memory_resource, private Arena {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
}

__device__ void device_func() {
auto t = cooperative_groups::this_thread();
t.size();
std::sin(1);
}

__host__ __device__ void host_device_func() {
Expand Down
File renamed without changes.
File renamed without changes.
33 changes: 33 additions & 0 deletions slides/moderncuda/main.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include <cuda_runtime.h>
#include <nvfunctional>
#include "cudapp.cuh"

using namespace cudapp;

// Demo kernel: every launched thread prints the scalar argument and its
// own (blockIdx.x, threadIdx.x) coordinates. extern "C" keeps the symbol
// unmangled so the kernel can be referenced by address from the host
// launch APIs below.
extern "C" __global__ void kernel(int x) {
printf("内核参数 x = %d\n", x);  // "kernel argument x = %d"
printf("线程编号 (%d, %d)\n", blockIdx.x, threadIdx.x);  // "thread id (block, thread)"
}

// Demonstrates three equivalent ways to launch the same kernel with a
// grid of 3 blocks and 4 threads per block.
int main() {
    int x = 42;

    // Launch 1: triple-chevron syntax (grid = 3, block = 4).
    kernel<<<3, 4, 0, 0>>>(x);
    // <<<>>> returns no status; launch-configuration errors surface here.
    CHECK_CUDA(cudaGetLastError());

    // Launch 2: cudaLaunchKernel with an argument-pointer array.
    void *args[] = {&x};
    CHECK_CUDA(cudaLaunchKernel((const void *)kernel, dim3(3), dim3(4), args, 0, 0));

    // Launch 3: extended launch API with typed arguments.
    // Bug fix: gridDim/blockDim were swapped (blockDim=3, gridDim=4),
    // inconsistent with the two launches above; all three now use the
    // same 3-block x 4-thread configuration.
    cudaLaunchConfig_t cfg{};
    cfg.gridDim = dim3(3);
    cfg.blockDim = dim3(4);
    cfg.dynamicSmemBytes = 0;
    cfg.stream = 0;
    cfg.attrs = nullptr;
    cfg.numAttrs = 0;
    CHECK_CUDA(cudaLaunchKernelEx(&cfg, kernel, x));

    const char *name;
    CHECK_CUDA(cudaFuncGetName(&name, (const void *)kernel));

    // Block until the default stream drains so the device printf output
    // is flushed before the process exits.
    CudaStream::nullStream().join();
    return 0;
}
7 changes: 1 addition & 6 deletions slides/moderncuda/tinybench.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
#define TINYBENCH_IMPL_MAIN
#define TINYBENCH_IMPL
#include "tinybench.hpp"

[[gnu::weak]] int main() {
std::unique_ptr<tinybench::Reporter> rep(tinybench::makeMultipleReporter({tinybench::makeConsoleReporter()}));
rep->run_all();
return 0;
}
10 changes: 10 additions & 0 deletions slides/moderncuda/tinybench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -885,3 +885,13 @@ Reporter *makeMultipleReporter(std::vector<Reporter *> const &reporters) {

}
#endif

// Optional default entry point: define TINYBENCH_IMPL_MAIN in exactly one
// translation unit to get a main() that runs every registered benchmark
// through a console reporter. [[gnu::weak]] lets a user-provided main()
// take precedence at link time.
#ifdef TINYBENCH_IMPL_MAIN
#include <memory>

[[gnu::weak]] int main() {
    auto console = tinybench::makeConsoleReporter();
    std::unique_ptr<tinybench::Reporter> reporter(
        tinybench::makeMultipleReporter({console}));
    reporter->run_all();
    return 0;
}
#endif

0 comments on commit 8c890ba

Please sign in to comment.