
fix memory leak, set lower extra memory size.
luoyu-intel committed Jun 21, 2024
1 parent 145dc98 commit 42fa774
Showing 3 changed files with 12 additions and 2 deletions.
bestla/bestla/bestla_prologue_b.h (7 additions, 0 deletions)
@@ -133,6 +133,12 @@ class WeightKBlockNInteger {
transposeWeight<int8_t>(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr<int8_t>(), srcstor.mDType,
threading);
+ if (s8buf) {
+   utils::afree(s8buf);
+ }
+ if (s8transbuf) {
+   utils::afree(s8transbuf);
+ }
int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
if (srcstor.mCorrection.mScaEleSize == 4) {
transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
@@ -141,6 +147,7 @@ class WeightKBlockNInteger {
transposeWeight<uint16_t>(nk_scale, srcstor.mNPad, srcstor.template SPtr<uint16_t>(), srcstor.mNPad,
dststor.template SPtr<uint16_t>(), dststor.CStep(), threading);
}
+
}
AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
utils::aligned_vector<float>* dq_buf) {
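The first change releases the two int8 scratch buffers, s8buf and s8transbuf, once the transposed and recompressed weights have been written into dststor; before this commit they were never freed, which is the leak named in the commit title. Below is a minimal, self-contained sketch of that allocate / use / free pattern. The function and buffer names, and the plain malloc/free calls, are illustrative stand-ins rather than bestla's API.

    // Sketch only: scratch buffers are allocated for a conversion pass and must
    // be released on the way out, mirroring the guarded utils::afree calls added above.
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    static void convert_with_scratch(const int8_t* src, int8_t* dst, size_t n) {
      int8_t* scratch = static_cast<int8_t*>(std::malloc(n));   // plays the role of s8buf
      int8_t* scratch2 = static_cast<int8_t*>(std::malloc(n));  // plays the role of s8transbuf
      if (scratch && scratch2) {
        std::memcpy(scratch, src, n);       // stand-in for unpacking the weight
        std::memcpy(scratch2, scratch, n);  // stand-in for transposeWeight
        std::memcpy(dst, scratch2, n);      // stand-in for compressWeight
      }
      // Without these two frees, every conversion leaks 2*n bytes.
      if (scratch) std::free(scratch);
      if (scratch2) std::free(scratch2);
    }

    int main() {
      int8_t src[64] = {}, dst[64] = {};
      convert_with_scratch(src, dst, sizeof(src));
      return 0;
    }

Freeing a null pointer is a no-op for std::free; the diff keeps explicit if guards around utils::afree, a safe choice when an allocator's null handling is not documented.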
neural_speed/core/layers/ne_bestla_sycl.cpp (3 additions, 0 deletions)
@@ -138,6 +138,9 @@ void bestla_device_load_storage(void* hoststor, void* devstor, void* deviceptr,
dstor->fromHost(transtor, (sycl::queue*)device_queue);
}
}
+ if (ptr) {
+   delete ptr;
+ }
}

template <class GCT>
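The second change deletes the host-side storage object ptr once bestla_device_load_storage has pushed its contents to the SYCL device through fromHost, so the host copy no longer outlives the call. A rough sketch of that ownership pattern follows; HostStorage, upload_to_device, and load_storage are made-up names for illustration, not the library's types.

    // Sketch only: a temporary host-side object is built, its data is uploaded
    // (a vector stands in for device memory here), and then it is deleted,
    // which is the step the commit adds with delete ptr.
    #include <vector>

    struct HostStorage {
      std::vector<float> weights;
    };

    static void upload_to_device(const HostStorage& src, std::vector<float>& device_mem) {
      device_mem = src.weights;  // stand-in for dstor->fromHost(transtor, queue)
    }

    static void load_storage(const std::vector<float>& blob, std::vector<float>& device_mem) {
      HostStorage* ptr = new HostStorage{blob};  // deserialized host-side copy
      upload_to_device(*ptr, device_mem);
      // delete on a null pointer is a no-op, so the if (ptr) guard in the diff
      // is purely defensive; the important part is that the delete now happens.
      delete ptr;
    }

    int main() {
      std::vector<float> blob(16, 1.0f), device_mem;
      load_storage(blob, device_mem);
      return 0;
    }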
neural_speed/models/llama/llama_utils.cpp (2 additions, 2 deletions)
@@ -97,8 +97,8 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
int n_cpu_layer = n_layer - n_gpu_layer;
n_cpu_layer = n_cpu_layer < 0 ? 0 : n_cpu_layer;
fprintf(stderr, "%s: ctx size = %7.2f MB\n", __func__, ctx_size / 1024.0 / 1024.0);
- auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
- auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
+ auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + (50 << 20);
+ auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + (50 << 20);
fprintf(stderr, "%s: host ctx size = %7.2f MB\n", __func__, host_size / 1024.0 / 1024.0);
#ifdef NS_SYCL
fprintf(stderr, "%s: device ctx size = %7.2f MB\n", __func__, device_size / 1024.0 / 1024.0);
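The llama_utils.cpp change replaces the extra slack term n_embd * n_vocab * sizeof(float) with a flat 50 MB ((50 << 20) bytes) in both the host and device budgets, matching the "set lower extra memory size" part of the commit title. The snippet below compares the two terms for an assumed 7B-class Llama configuration (n_embd = 4096, n_vocab = 32000); these values are illustrative and not taken from the diff.

    // Back-of-the-envelope comparison of the old and new extra-memory terms.
    #include <cstdio>

    int main() {
      const double n_embd = 4096, n_vocab = 32000;                // assumed model dims
      const double old_extra = n_embd * n_vocab * sizeof(float);  // old slack term
      const double new_extra = double(50 << 20);                  // new flat 50 MB
      std::printf("old extra: %.1f MB\n", old_extra / 1024.0 / 1024.0);  // prints 500.0
      std::printf("new extra: %.1f MB\n", new_extra / 1024.0 / 1024.0);  // prints 50.0
      return 0;
    }

Under that assumption the old term reserved about 500 MB in each budget, so the change trims roughly 450 MB of slack from both the host and device context sizes.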
