From 8a3b20ec9b0980d7d9e5e8c92ffaeeec424a2cbb Mon Sep 17 00:00:00 2001 From: Slava Savenko Date: Sat, 17 Nov 2018 17:07:59 +0100 Subject: [PATCH] Issue/23 slow vulkan cuda (#24) * improve sync kernel calls latencies. bump version --- CMakeLists.txt | 5 +-- doc/array_usage.md | 51 ++++++++++++++++++++----------- doc/features_not_to_come.md | 2 +- doc/features_to_come.md | 4 +-- readme.md | 2 +- src/include/vuh/arr/arrayView.hpp | 2 +- src/include/vuh/program.hpp | 6 ++-- 7 files changed, 41 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cb12d1..22127bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.8) -project(vuh VERSION 1.1.0) +project(vuh VERSION 1.1.1) option(VUH_BUILD_BENCHMARKS "Build benchmarks for vuh library" OFF) option(VUH_BUILD_DOCS "Build doxygen documentation for vuh" ON) @@ -7,9 +7,6 @@ option(VUH_BUILD_EXAMPLES "Build examples of using vuh" ON) option(VUH_BUILD_TESTS "Build tests for vuh library" ON) set(CMAKE_CXX_STANDARD 14) -if(UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") -endif() list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/config) enable_testing() diff --git a/doc/array_usage.md b/doc/array_usage.md index a12d34d..ad2a6fd 100644 --- a/doc/array_usage.md +++ b/doc/array_usage.md @@ -1,30 +1,28 @@ # Array usage -At the moment objects of vuh::Array family only provide abstraction over Vulkan storage buffers. -Each vuh::Array object is basically a vkBuffer together with owned memory chunk and some auxiliary data. +The objects of vuh::Array family only provide abstraction over uniform storage buffers. +Each vuh::Array object is basically a vkBuffer together with its own memory chunk and some auxiliary data. Array interface for data exchange with a host system depends on a kind of allocator used. If requested allocation strategy fails the fall-back strategy will be used if such exists. 
-Naturally the data exchange interface would not change when a fall-back allocator kicks in. -When fall-back options are exhausted and memory is not allocated exception will be thrown. +If a fall-back allocator kicks in, the data exchange interface would not change. +In that case some (but not all) operations will still be optimized for the actual type of memory in use. +When all fall-back options are exhausted and memory is not allocated an exception is thrown. Exceptions thrown from ```vuh::Array``` are all members of ```vk::Error``` family. -Some (but not all) operations will be optimized still under the cover in that case for the -actual memory allocated. -Thus to get maximum performance it is better to match the memory at hand. -For example on integrated GPUs use of ```vuh::mem::Host``` would be optimal, -while device-local would still work. +To get the maximum performance it is better to match the exact type of memory at hand. +For example on integrated GPUs using ```vuh::mem::Host``` would be optimal while device-local still works. In this particular example the same memory would be allocated but result in Array with restricted data exchange interface and potential use of stage buffers for some operations. -Allocation strategy does not make any difference for the purpose of passing Array-s to kernels. -All data transfers operations are blocking (at the moment). +Allocation strategy does not make any difference for the purpose of passing Arrays to computation kernels. Below there is a more detailed description of most useful Allocator options and corresponding Array usage. -## Device (```vuh::mem::Device```) +## Allocations +### Device (```vuh::mem::Device```) Array memory will be requested in device-local memory. The fall-back allocation strategy is ```vuh::mem::Host```. This type of arrays is to be used in kernels, but data transfer to/from host is expected. This is the default allocation strategy, so you can skip typing ```vuh::mem::Device```. 
Its construction and data transfer interface enables efficient data handling with a potential to avoid extra (staging) copy, handle big transfers in smaller chunks and partial latency hiding. -### Construction and data transfer from hostk +#### Construction and data transfer from host ```cpp const auto ha = std::vector(1024, 3.14f); // host array to initialize from using Array = vuh::Array; // = vuh::Array; @@ -35,7 +33,7 @@ auto array_1 = Array(device, ha); // create array of floats a auto array_2 = Array(device, begin(ha), end(ha)); // same in stl range style auto array_3 = Array(device, 1024, [&](size_t i){return ha[i];}); // create + index-based transform ``` -### Transfer data to host +#### Transfer data to host ```cpp auto ha = std::vector(1024, 3.14f); // host iterable to copy data to auto array = vuh::Array(device, 1024); // device array to copy data from @@ -46,7 +44,7 @@ array.toHost(begin(ha), 512, [](auto x){return x;}); // copy-transforn part the ha = array.toHost>(); // copy the whole device array to host ``` -## Device-Only (```vuh::mem::DeviceOnly```) +### Device-Only (```vuh::mem::DeviceOnly```) ```cpp // create device-only array of 1024 floats auto array = vuh::Array(device, 1024); @@ -59,7 +57,7 @@ Apart from missing data transfer interface it only differs from a normal Device So it may show a bit better performance but most probably wouldn't. In any case it is not worse then that and indicates intended usage so is a useful creature. -## Host (```vuh::mem::Host```) +### Host (```vuh::mem::Host```) For these memory is allocated on a host in a 'pinned' area, so that it is visible to GPU. This is the only kind of memory you can get with integrated GPUs (althogh there it is flagged as device-local, so technically it would be the same as ```vuh::mem::Unified```). @@ -72,7 +70,7 @@ If it fails exception is thrown and no resources get allocated. Its construction and data transfer interface follows that for a standard containers. 
With an important difference that while it provides random access with operator [], the iterators fall into 'mutable forward' category. -### Construction and data exchange interface +#### Construction and data exchange interface ```cpp auto ha = std::vector(1024, 3.14f); // host array to initialize from using Array = vuh::Array; @@ -85,7 +83,7 @@ array[42] = 6.28f; // random access with [] std::copy(begin(ha), end(ha), begin(array)); // forward-iterable ``` -## Unified (```vuh::mem::Unified```) +### Unified (```vuh::mem::Unified```) Allocation for these arrays takes place in a device local and host visible memory. Although such labeled is all memory in integrated GPUs, that is not the target use case. It is rather for the devices such as some Radeon cards that have some (relatively small) @@ -95,3 +93,20 @@ from the host side. There is no fall-back allocation strategy, if allocation in device-local & host-visible memory fails exception is thrown. Construction and data exchange interface mirrors that of ```vuh::mem::Host``` allocated arrays. + +## Iterators +Iterators provide means to copy around parts of ```vuh::Array``` data and constitute the interface of the ```copy_async``` family of functions. +Iterators to device data are created with ```device_begin()```, ```device_end()``` helper functions. +Offsetting those can be done with just a + operator (so those are kind of random access iterators). +```cpp +copy_async(device_begin(array_src), device_begin(array_src) + chunk_size + , device_begin(array_dst)); +``` +Iterators to host-accessible data (if such exists for the allocator used with particular array) are obtained with the usual ```begin()``` and ```end()```. These are true random access iterators and are implemented just as pointers at the moment. + +## Array views +ArrayView is the non-owning read-write range of contiguous data of some ```Array``` object. +It serves mainly as a tool to pass partial arrays to computational kernels. 
+ArrayView can be used interchangeably with Array for that purpose. +Copy operations at the moment do not support views and rely fully on iterators for similar tasks. +The convenience way to create the ArrayView is the ```array_view``` factory function. diff --git a/doc/features_not_to_come.md b/doc/features_not_to_come.md index 2abc378..9fd9cde 100644 --- a/doc/features_not_to_come.md +++ b/doc/features_not_to_come.md @@ -2,4 +2,4 @@ This is to list the non-goals or things that are simply not achievable with a current approach - Full-blown Vulkan wrapper - this is first and foremost GPGPU helper - Library of compute shader primitives - worth of a separate project -- Dynamic parallelism - not doable in Vulkan-SPIR-V framework. +- Dynamic parallelism - I do not see how it is doable in the Vulkan-SPIR-V framework. diff --git a/doc/features_to_come.md b/doc/features_to_come.md index 6b31976..1518b57 100644 --- a/doc/features_to_come.md +++ b/doc/features_to_come.md @@ -6,9 +6,9 @@ This is to keep track of ideas on what (big) features could/should be implemente - dynamic uniforms - memory pooling - using multiple queues on a single device -- pipelining kernels and data transfers with GPU-side sync +- async data transfers and kernel execution with GPU-side sync - option to use in no-exception environments - headers generation from shaders - better integration/data exchange with graphic pipelines - compile for no-GPU environments -- Python interface +- Python bindings diff --git a/readme.md b/readme.md index 5e83ffd..f9db943 100644 --- a/readme.md +++ b/readme.md @@ -59,7 +59,7 @@ void main(){ + specialization constants (to set workgroup dimensions, etc...) + push-constants (to pass small data (<= 128 Bytes), like task dimensions etc...) + whatever compute shaders support, shared memory, etc... 
-- async data transfer/kernel execution with host-side synchronization +- asynchronous data transfer and kernel execution with host-side synchronization - multiple device support - [yet to come...](doc/features_to_come.md) - [not ever coming...](doc/features_not_to_come.md) diff --git a/src/include/vuh/arr/arrayView.hpp b/src/include/vuh/arr/arrayView.hpp index 8dd4924..0b10a21 100644 --- a/src/include/vuh/arr/arrayView.hpp +++ b/src/include/vuh/arr/arrayView.hpp @@ -32,7 +32,7 @@ namespace vuh { std::size_t _offset_end; ///< offset (number of array elements) of the end (one past the last valid elements) of the span. }; // class ArrayView - /// doc me + /// Create an ArrayView into a given Array. template auto array_view(Array& array, std::size_t offset_begin, size_t offset_end)-> ArrayView{ return ArrayView(array, offset_begin, offset_end); diff --git a/src/include/vuh/program.hpp b/src/include/vuh/program.hpp index 6bfd033..3c83e05 100644 --- a/src/include/vuh/program.hpp +++ b/src/include/vuh/program.hpp @@ -116,10 +116,8 @@ namespace vuh { // submit the command buffer to the queue and set up a fence. auto queue = _device.computeQueue(); - auto fence = _device.createFence(vk::FenceCreateInfo()); // fence makes sure the control is not returned to CPU till command buffer is depleted - queue.submit({submitInfo}, fence); - _device.waitForFences({fence}, true, uint64_t(-1)); // -1 means wait for the fence indefinitely - _device.destroyFence(fence); + queue.submit({submitInfo}, nullptr); + queue.waitIdle(); } /// Run the Program object on previously bound parameters.