From 187b84bd66b86ab64d26aa536d76f11f8c33fa1e Mon Sep 17 00:00:00 2001
From: Keith Horrocks
Date: Fri, 18 Oct 2024 05:12:48 -0700
Subject: [PATCH] Update Various C++ Documentation Examples to Current
 Interface (#398)

* Fix "Your First Kompute (C++)" Example

Updated so the example compiles in a test project. With the addition of
an Image class various things were reorganized, including algorithm
using kp::Memory instead of just kp::Tensor and kp::OpTensorSync*
operations renamed to kp::OpSync*.

Signed-off-by: Keith Horrocks

* Fix TestBenchmark to Compile and Run

With the addition of an Image class various things were reorganized,
including algorithm using kp::Memory instead of just kp::Tensor and
kp::OpTensorSync* operations renamed to kp::OpSync*.

Tested building with KOMPUTE_OPT_ENABLE_BENCHMARK="ON" and verifying
resulting binary runs.

Signed-off-by: Keith Horrocks

* Fix "Add Extensions" C++ Example

More renames from kp::OpTensorSync* to kp::OpSync*

Example now compiles in a test project

Signed-off-by: Keith Horrocks

* Fix "Your Custom Kompute Operation" C++ Example

Various corrections, including renaming kp::OpTensorSync* operations to
kp::OpSync*. Example now compiles in a test project.

Signed-off-by: Keith Horrocks

* Fix "Async/Await Example" in C++

Various corrections, including renaming kp::OpTensorSync* operations to
kp::OpSync*. Example now compiles when pieced together in a test
project.

Signed-off-by: Keith Horrocks

* Fix "Parallel Execution Example" in C++

Various corrections, including renaming kp::OpTensorSync* operations to
kp::OpSync*. Example now compiles when pieced together in a test
project.

Signed-off-by: Keith Horrocks

---------

Signed-off-by: Keith Horrocks
---
 README.md                           |  6 +--
 benchmark/TestBenchmark.cpp         |  6 +--
 docs/overview/advanced-examples.rst | 61 +++++++++++++++--------------
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 85a22a30..38f357bf 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ void kompute(const std::string& shader) {
     auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
     auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
 
-    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
+    std::vector<std::shared_ptr<kp::Memory>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
 
     // 3. Create algorithm based on shader (supports buffers & push/spec constants)
     kp::Workgroup workgroup({3, 1, 1});
@@ -110,7 +110,7 @@ void kompute(const std::string& shader) {
 
     // 4. Run operation synchronously using sequence
     mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>(params)
+        ->record<kp::OpSyncDevice>(params)
         ->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
         ->eval() // Evaluates the two recorded operations
         ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
@@ -118,7 +118,7 @@ void kompute(const std::string& shader) {
 
     // 5. Sync results from the GPU asynchronously
     auto sq = mgr.sequence();
-    sq->evalAsync<kp::OpTensorSyncLocal>(params);
+    sq->evalAsync<kp::OpSyncLocal>(params);
 
     // ... Do other work asynchronously whilst GPU finishes
 
diff --git a/benchmark/TestBenchmark.cpp b/benchmark/TestBenchmark.cpp
index c803ab8a..77958608 100644
--- a/benchmark/TestBenchmark.cpp
+++ b/benchmark/TestBenchmark.cpp
@@ -48,7 +48,7 @@ TEST(TestBenchmark, TestMultipleSequenceOperationMostlyGPU)
     std::shared_ptr<kp::TensorT<float>> tensorInB = mgr.tensorT<float>(std::vector<float>(numElems, elemValue));
     std::shared_ptr<kp::TensorT<float>> tensorOut = mgr.tensor(std::vector<float>(numElems, 0));
 
-    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorInA, tensorInB, tensorOut };
+    std::vector<std::shared_ptr<kp::Memory>> params = { tensorInA, tensorInB, tensorOut };
 
     // Opt: Avoiding using anonimous sequences when we will reuse
     std::vector<std::shared_ptr<kp::Sequence>> sequences(numSeqs);
@@ -63,7 +63,7 @@ TEST(TestBenchmark, TestMultipleSequenceOperationMostlyGPU)
         }
     }
 
-    mgr.sequence()->eval<kp::OpTensorSyncDevice>({ tensorInA });
+    mgr.sequence()->eval<kp::OpSyncDevice>({ tensorInA });
 
     auto startTime = std::chrono::high_resolution_clock::now();
 
@@ -83,7 +83,7 @@ TEST(TestBenchmark, TestMultipleSequenceOperationMostlyGPU)
       std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime)
         .count();
 
-    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorOut });
+    mgr.sequence()->eval<kp::OpSyncLocal>({ tensorOut });
 
     EXPECT_EQ(tensorOut->vector(),
               std::vector<float>(numElems, elemValue * numIter * numOps * numSeqs));
diff --git a/docs/overview/advanced-examples.rst b/docs/overview/advanced-examples.rst
index ee44c582..8c8407f5 100644
--- a/docs/overview/advanced-examples.rst
+++ b/docs/overview/advanced-examples.rst
@@ -69,12 +69,12 @@ The example below shows how you can enable the "VK_EXT_shader_atomic_float" exte
       mgr.algorithm({ tensor }, spirv, kp::Workgroup({ 1 }), {}, { 0.0, 0.0, 0.0 });
 
     sq = mgr.sequence()
-           ->record<kp::OpTensorSyncDevice>({ tensor })
+           ->record<kp::OpSyncDevice>({ tensor })
            ->record<kp::OpAlgoDispatch>(algo,
                                         std::vector<float>{ 0.1, 0.2, 0.3 })
            ->record<kp::OpAlgoDispatch>(algo,
                                         std::vector<float>{ 0.3, 0.2, 0.1 })
-           ->record<kp::OpTensorSyncLocal>({ tensor })
+           ->record<kp::OpSyncLocal>({ tensor })
            ->eval();
 
     EXPECT_EQ(tensor->data(), std::vector<float>({ 0.4, 0.4, 0.4 }));
@@ -92,12 +92,12 @@ We also provide tools that allow you to `convert shaders into C++ headers
 .. code-block:: cpp
    :linenos:
 
-    class OpMyCustom : public kp::OpAlgoBase
+    class OpMyCustom : public kp::OpAlgoDispatch
     {
       public:
-        OpMyCustom(std::vector<std::shared_ptr<kp::Tensor>> tensors,
+        OpMyCustom(std::vector<std::shared_ptr<kp::Memory>> tensors,
                    std::shared_ptr<kp::Algorithm> algorithm)
-          : OpAlgoBase(algorithm)
+          : kp::OpAlgoDispatch(algorithm)
         {
             if (tensors.size() != 3) {
                 throw std::runtime_error("Kompute OpMult expected 3 tensors but got " + tensors.size());
@@ -135,7 +135,7 @@ We also provide tools that allow you to `convert shaders into C++ headers
 
             algorithm->rebuild(tensors, spirv);
         }
-    }
+    };
 
     int main() {
 
@@ -148,13 +148,13 @@ We also provide tools that allow you to `convert shaders into C++ headers
     auto tensorOut = mgr.tensor({ 0., 0., 0. });
 
     mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>({tensorLhs, tensorRhs, tensorOut})
-        ->record<kp::OpMyCustom>({tensorLhs, tensorRhs, tensorOut}, mgr.algorithm())
-        ->record<kp::OpTensorSyncLocal>({tensorLhs, tensorRhs, tensorOut})
+        ->record<kp::OpSyncDevice>({tensorLhs, tensorRhs, tensorOut})
+        ->record<OpMyCustom>({tensorLhs, tensorRhs, tensorOut}, mgr.algorithm())
+        ->record<kp::OpSyncLocal>({tensorLhs, tensorRhs, tensorOut})
         ->eval();
 
     // Prints the output which is { 0, 4, 12 }
-    std::cout << fmt::format("Output: {}", tensorOutput.data()) << std::endl;
+    std::cout << fmt::format("Output: {}", tensorOut->vector()) << std::endl;
     }
 
 Async/Await Example
@@ -170,8 +170,8 @@ First we are able to create the manager as we normally would.
     // You can allow Kompute to create the GPU resources, or pass your existing ones
     kp::Manager mgr; // Selects device 0 unless explicitly requested
 
-    // Creates tensor an initializes GPU memory (below we show more granularity)
-    auto tensor = mgr.tensor(10, 0.0);
+    // Creates tensor and initializes GPU memory (below we show more granularity)
+    auto tensor = mgr.tensorT<float>(10);
 
 We can now run our first asynchronous command, which in this case we can use the default sequence.
 
@@ -181,7 +181,7 @@ Sequences can be executed in synchronously or asynchronously without having to c
    :linenos:
 
     // Create tensors data explicitly in GPU with an operation
-    mgr.sequence()->eval<kp::OpTensorSyncDevice>({tensor});
+    mgr.sequence()->eval<kp::OpSyncDevice>({tensor});
 
 While this is running we can actually do other things like in this case create the shader we'll be using.
 
@@ -231,7 +231,7 @@ The parameter provided is the maximum amount of time to wait in nanoseconds. Whe
 .. code-block:: cpp
    :linenos:
 
-    auto sq = mgr.sequence()
+    auto sq = mgr.sequence();
 
     // Run Async Kompute operation on the parameters provided
     sq->evalAsync<kp::OpAlgoDispatch>(algo);
@@ -240,7 +240,7 @@ The parameter provided is the maximum amount of time to wait in nanoseconds. Whe
 
     // When we're ready we can wait
     // The default wait time is UINT64_MAX
-    sq.evalAwait()
+    sq->evalAwait();
 
 Finally, below you can see that we can also run syncrhonous commands without having to change anything.
 
@@ -250,11 +250,11 @@ Finally, below you can see that we can also run syncrhonous commands without hav
 
     // Sync the GPU memory back to the local tensor
     // We can still run synchronous jobs in our created sequence
-    sq.eval<kp::OpTensorSyncLocal>({ tensor });
+    sq->eval<kp::OpSyncLocal>({ tensor });
 
     // Prints the output: B: { 100000000, ... }
     std::cout << fmt::format("B: {}",
-        tensor.data()) << std::endl;
+        tensor->vector()) << std::endl;
 
 
 Parallel Operation Submission
@@ -318,8 +318,8 @@ It's worth mentioning you can have multiple sequences referencing the same queue
     // We need to create explicit sequences with their respective queues
     // The second parameter is the index in the familyIndex array which is relative
     // to the vector we created the manager with.
-    sqOne = mgr.sequence(0);
-    sqTwo = mgr.sequence(1);
+    auto sqOne = mgr.sequence(0);
+    auto sqTwo = mgr.sequence(1);
 
 We create the tensors without modifications.
 
@@ -327,11 +327,11 @@ We create the tensors without modifications.
    :linenos:
 
     // Creates tensor an initializes GPU memory (below we show more granularity)
-    auto tensorA = mgr.tensor({ 10, 0.0 });
-    auto tensorB = mgr.tensor({ 10, 0.0 });
+    auto tensorA = mgr.tensorT<float>(10);
+    auto tensorB = mgr.tensorT<float>(10);
 
     // Copies the data into GPU memory
-    mgr.sequence().eval<kp::OpTensorSyncDevice>({tensorA tensorB});
+    mgr.sequence()->eval<kp::OpSyncDevice>({tensorA, tensorB});
 
 Similar to the asyncrhonous usecase above, we can still run synchronous commands without modifications.
 
@@ -367,7 +367,8 @@ Similar to the asyncrhonous usecase above, we can still run synchronous commands
     // See shader documentation section for compileSource
     std::vector<uint32_t> spirv = compileSource(shader);
 
-    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm({tensorA, tenssorB}, spirv);
+    std::shared_ptr<kp::Algorithm> algoOne = mgr.algorithm({ tensorA }, spirv);
+    std::shared_ptr<kp::Algorithm> algoTwo = mgr.algorithm({ tensorB }, spirv);
 
 Now we can actually trigger the parallel processing, running two OpAlgoBase Operations - each in a different sequence / queue.
 
@@ -375,15 +376,15 @@ Now we can actually trigger the parallel processing, running two OpAlgoBase Oper
    :linenos:
 
     // Run the first parallel operation in the `queueOne` sequence
-    sqOne->evalAsync<kp::OpAlgoDispatch>(algo);
+    sqOne->evalAsync<kp::OpAlgoDispatch>(algoOne);
 
     // Run the second parallel operation in the `queueTwo` sequence
-    sqTwo->evalAsync<kp::OpAlgoDispatch>(algo);
+    sqTwo->evalAsync<kp::OpAlgoDispatch>(algoTwo);
 
 Similar to the asynchronous example above, we are able to do other work
 whilst the tasks are executing.
 
-We are able to wait for the tasks to complete by triggering the `evalOpAwait` on the respective sequence.
+We are able to wait for the tasks to complete by triggering the `evalAwait` on the respective sequence.
 
 .. code-block:: cpp
    :linenos:
@@ -391,14 +392,14 @@ We are able to wait for the tasks to complete by triggering the `evalOpAwait` on
     // Here we can do other work
 
     // We can now wait for the two parallel tasks to finish
-    sqOne.evalOpAwait()
-    sqTwo.evalOpAwait()
+    sqOne->evalAwait();
+    sqTwo->evalAwait();
 
     // Sync the GPU memory back to the local tensor
-    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB });
+    mgr.sequence()->eval<kp::OpSyncLocal>({ tensorA, tensorB });
 
     // Prints the output: A: 100000000 B: 100000000
     std::cout << fmt::format("A: {}, B: {}",
-        tensorA.data()[0], tensorB.data()[0]) << std::endl;
+        tensorA->data()[0], tensorB->data()[0]) << std::endl;
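
For reference, the renamed calls exercised by this patch piece together into a single end-to-end program along the following lines. This is a minimal sketch rather than text from the patch: the compileSource helper is assumed from the shader documentation section (exactly as the docs hunks above reference it), and the element-wise multiply shader is a hypothetical stand-in. The Kompute calls themselves (mgr.tensor, mgr.algorithm, kp::Memory parameter lists, and the renamed kp::OpSyncDevice / kp::OpSyncLocal operations) are the ones shown in the diffs.

    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    #include <fmt/ranges.h>      // fmt with range support, to print std::vector
    #include <kompute/Kompute.hpp>

    // Assumed helper: compiles GLSL to SPIR-V, as described in the shader
    // documentation section referenced by the docs above.
    std::vector<uint32_t> compileSource(const std::string& source);

    int main()
    {
        kp::Manager mgr; // Selects device 0 unless explicitly requested

        // Default tensor constructor creates float tensors
        auto tensorLhs = mgr.tensor({ 0., 1., 2. });
        auto tensorRhs = mgr.tensor({ 2., 4., 6. });
        auto tensorOut = mgr.tensor({ 0., 0., 0. });

        // Post-rename: algorithm parameters are kp::Memory, not kp::Tensor
        std::vector<std::shared_ptr<kp::Memory>> params = {
            tensorLhs, tensorRhs, tensorOut
        };

        // Hypothetical multiply shader: one invocation per element
        const std::string shader = R"(
            #version 450
            layout (local_size_x = 1) in;
            layout (set = 0, binding = 0) buffer bufLhs { float lhs[]; };
            layout (set = 0, binding = 1) buffer bufRhs { float rhs[]; };
            layout (set = 0, binding = 2) buffer bufOut { float res[]; };
            void main() {
                uint i = gl_GlobalInvocationID.x;
                res[i] = lhs[i] * rhs[i];
            }
        )";

        auto algorithm = mgr.algorithm(params, compileSource(shader));

        // Post-rename: kp::OpTensorSyncDevice/Local are now kp::OpSyncDevice/Local
        mgr.sequence()
            ->record<kp::OpSyncDevice>(params)
            ->record<kp::OpAlgoDispatch>(algorithm)
            ->record<kp::OpSyncLocal>(params)
            ->eval();

        // Prints roughly: Output: [0, 4, 12]
        std::cout << fmt::format("Output: {}", tensorOut->vector()) << std::endl;
    }

The sketch follows the same record/eval flow as the updated README example; only the shader and the compileSource declaration are assumptions not present in the patch.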