Add copyRegions to kp::OpTensorSyncDevice

Signed-off-by: crydsch <crydsch@lph.zone>
KomputeProject · Aug 17, 2023 · 65c0cbb · 65c0cbb
1 parent a8feda4
commit 65c0cbb
Show file tree

Hide file tree

Showing 6 changed files with 95 additions and 5 deletions.
diff --git a/src/OpTensorSyncDevice.cpp b/src/OpTensorSyncDevice.cpp
@@ -5,7 +5,8 @@
 namespace kp {
 
 OpTensorSyncDevice::OpTensorSyncDevice(
-  const std::vector<std::shared_ptr<Tensor>>& tensors)
+  const std::vector<std::shared_ptr<Tensor>>& tensors,
+  const std::vector<vk::BufferCopy>& copyRegions)
 {
     KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params");
 
@@ -15,13 +16,15 @@ OpTensorSyncDevice::OpTensorSyncDevice(
     }
 
     this->mTensors = tensors;
+    this->mCopyRegions = copyRegions;
 }
 
 OpTensorSyncDevice::~OpTensorSyncDevice()
 {
     KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started");
 
     this->mTensors.clear();
+    // Also drop the stored per-tensor copy regions.
+    this->mCopyRegions.clear();
 }
 
 void
@@ -31,7 +34,11 @@ OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer)
 
     for (size_t i = 0; i < this->mTensors.size(); i++) {
         if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) {
-            this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer);
+            if (i < this->mCopyRegions.size()) {
+               this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer, this->mCopyRegions[i]);
+            } else {
+               this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer);
+            }
         }
     }
 }

diff --git a/src/Tensor.cpp b/src/Tensor.cpp
@@ -217,6 +217,14 @@ Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer)
     vk::DeviceSize bufferSize(this->memorySize());
     vk::BufferCopy copyRegion(0, 0, bufferSize);
 
+    this->recordCopyFromStagingToDevice(commandBuffer, copyRegion);
+}
+
+void
+Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion)
+{
+    vk::DeviceSize bufferSize(this->memorySize());
+
     KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize);
 
     this->recordCopyBuffer(commandBuffer,

diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp
@@ -120,6 +120,21 @@ class Sequence : public std::enable_shared_from_this<Sequence>
         std::shared_ptr<T> op{ new T(tensors, std::forward<TArgs>(params)...) };
         return this->eval(op);
     }
+    /**
+     * Eval sends all the recorded and stored operations in the vector of
+     * operations into the gpu as a submit job with a barrier.
+     *
+     * This overload constructs an operation of type T from the given tensors
+     * and copy regions, then forwards it to eval(op) and returns that call's
+     * result. The copy regions are taken as a concrete vector type
+     * (presumably so a braced initializer list can be passed at the call
+     * site; confirm against callers).
+     *
+     * NOTE(review): the TArgs parameter pack is not used by this overload.
+     *
+     * @param tensors Vector of tensors to use for the operation
+     * @param copyRegions Vector of buffer regions to copy (one per tensor)
+     * @return shared_ptr<Sequence> of the Sequence class itself
+     */
+    template<typename T, typename... TArgs>
+    std::shared_ptr<Sequence> eval(std::vector<std::shared_ptr<Tensor>> tensors,
+                                   std::vector<vk::BufferCopy> copyRegions)
+    {
+        std::shared_ptr<T> op{ new T(tensors, copyRegions) };
+        return this->eval(op);
+    }
     /**
      * Eval sends all the recorded and stored operations in the vector of
      * operations into the gpu as a submit job with a barrier.

diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp
@@ -111,12 +111,22 @@ class Tensor
     /**
      * Records a copy from the internal staging memory to the device memory
      * using an optional barrier to wait for the operation. This function would
-     * only be relevant for kp::Tensors of type eDevice.
+     * only be relevant for kp::Tensors of type eDevice. Copies the entire tensor.
      *
      * @param commandBuffer Vulkan Command Buffer to record the commands into
      */
     void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer);
 
+    /**
+     * Records a copy from the internal staging memory to the device memory
+     * using an optional barrier to wait for the operation. This function would
+     * only be relevant for kp::Tensors of type eDevice. Copies only the given
+     * region rather than the entire tensor.
+     *
+     * @param commandBuffer Vulkan Command Buffer to record the commands into
+     * @param copyRegion The buffer region to copy; as a vk::BufferCopy its
+     * source offset, destination offset and size are expressed in bytes,
+     * not in tensor elements
+     */
+    void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion);
+
     /**
      * Records a copy from the internal device memory to the staging memory
      * using an optional barrier to wait for the operation. This function would

diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp
@@ -20,12 +20,14 @@ class OpTensorSyncDevice : public OpBase
   public:
     /**
      * Default constructor with parameters that provides the core vulkan
-     * resources and the tensors that will be used in the operation. The tensos
+     * resources and the tensors that will be used in the operation. The tensors
      * provided cannot be of type TensorTypes::eStorage.
+     * Optionally for each tensor a buffer region to copy can be specified:
+     * copyRegions[i] is applied to tensors[i], and any tensor without a
+     * matching entry is copied in full.
      *
      * @param tensors Tensors that will be used to create in operation.
+     * @param copyRegions Buffer regions to copy, matched to the tensors by
+     * index. Defaults to empty, in which case every tensor is copied in full.
      */
-    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors);
+    OpTensorSyncDevice(const std::vector<std::shared_ptr<Tensor>>& tensors, const std::vector<vk::BufferCopy>& copyRegions = {});
 
     /**
      * Default destructor. This class does not manage memory so it won't be
@@ -58,6 +60,7 @@ class OpTensorSyncDevice : public OpBase
   private:
     // -------------- ALWAYS OWNED RESOURCES
     std::vector<std::shared_ptr<Tensor>> mTensors;
+    std::vector<vk::BufferCopy> mCopyRegions;
 };
 
 } // End namespace kp
diff --git a/test/TestOpTensorSync.cpp b/test/TestOpTensorSync.cpp
@@ -53,3 +53,50 @@ TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor)
     EXPECT_EQ(tensorB->vector(), testVec);
     EXPECT_EQ(tensorC->vector(), testVec);
 }
+
+// Checks that kp::OpTensorSyncDevice honours a per-tensor copy region: only
+// the requested bytes of tensorA are uploaded, while tensorC (which has no
+// region) is uploaded in full.
+TEST(TestOpTensorSync, SyncToDeviceMemoryCopyRegion)
+{
+
+    kp::Manager mgr;
+
+    std::vector<float> testVecPreA{ 1, 2, 3, 4 };
+    std::vector<float> testVecPostA{ 0, 1, 0, 0 };
+    // Not asserted yet; presumably the expected tensorB contents once the
+    // TODO items below are implemented.
+    std::vector<float> testVecPostB{ 0, 0, 0, 1 };
+    std::vector<float> testVecC{ 5, 6, 7, 8 };
+
+    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 0, 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0, 0 });
+    std::shared_ptr<kp::TensorT<float>> tensorC = mgr.tensor({ 0, 0, 0, 0 });
+
+    EXPECT_TRUE(tensorA->isInit());
+    EXPECT_TRUE(tensorB->isInit());
+    EXPECT_TRUE(tensorC->isInit());
+
+    // Upload the all-zero initial contents first so the device buffers hold
+    // known values before the partial copy below; the expectations rely on
+    // the untouched elements being 0. NOTE(review): this assumes creating a
+    // tensor via mgr.tensor() does not itself upload to the device - confirm;
+    // if it does, this sync is redundant but harmless.
+    mgr.sequence()->eval<kp::OpTensorSyncDevice>({ tensorA, tensorB, tensorC });
+
+    tensorA->setData(testVecPreA);
+    tensorC->setData(testVecC);
+
+    // TODO
+    //  should use custom kp::copyRegion struct with index/number of elements instead of bytes
+    //  how to handle out of bounds access?
+    //  add copyRegion support to kp::OpTensorCopy
+    //  add copyRegion support to kp::OpTensorSyncLocal
+    //  add template specialization to sequence->record()
+
+    // vk::BufferCopy copyRegion;
+    // copyRegion.srcOffset = 0; // in bytes
+    // copyRegion.dstOffset = 1 * tensorA->dataTypeMemorySize(); // in bytes
+    // copyRegion.size = 1 * tensorA->dataTypeMemorySize(); // in bytes
+    // mgr.sequence()->eval<kp::OpTensorSyncDevice>({ tensorA, tensorC }, { copyRegion });
+
+    EXPECT_EQ(sizeof(float), tensorA->dataTypeMemorySize());
+    // Region is { srcOffset, dstOffset, size } in bytes: staging element 0 of
+    // tensorA (value 1) is written to device element 1. Only one region is
+    // supplied, so tensorC falls back to a full copy.
+    mgr.sequence()->eval<kp::OpTensorSyncDevice>(
+      { tensorA, tensorC },
+      { { 0, 1 * sizeof(float), 1 * sizeof(float) } });
+    // tensorA on the device now looks like: [ 0, 1, 0, 0 ]
+
+    mgr.sequence()->eval<kp::OpTensorCopy>({ tensorA, tensorB }); // TODO copy only tensorA index 1 to tensorB index 2
+
+    mgr.sequence()->eval<kp::OpTensorSyncLocal>({ tensorA, tensorB, tensorC }); // TODO copy tensorB index 2 to tensorB index 3
+
+    EXPECT_EQ(tensorA->vector(), testVecPostA);
+    // tensorB is a full copy of tensorA until the TODOs above are addressed.
+    EXPECT_EQ(tensorB->vector(), testVecPostA);
+    EXPECT_EQ(tensorC->vector(), testVecC);
+}