
fix memory leak, set lower extra memory size.
luoyu-intel committed Jun 21, 2024
1 parent 145dc98 commit 42fa774
Showing 3 changed files with 12 additions and 2 deletions.
bestla/bestla/bestla_prologue_b.h (7 additions, 0 deletions)
@@ -133,6 +133,12 @@ class WeightKBlockNInteger {
transposeWeight<int8_t>(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr<int8_t>(), srcstor.mDType,
threading);
+ if (s8buf) {
+   utils::afree(s8buf);
+ }
+ if (s8transbuf) {
+   utils::afree(s8transbuf);
+ }
int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
if (srcstor.mCorrection.mScaEleSize == 4) {
transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
@@ -141,6 +147,7 @@ class WeightKBlockNInteger {
transposeWeight<uint16_t>(nk_scale, srcstor.mNPad, srcstor.template SPtr<uint16_t>(), srcstor.mNPad,
dststor.template SPtr<uint16_t>(), dststor.CStep(), threading);
}
+
}
AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
utils::aligned_vector<float>* dq_buf) {
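The first change releases the two int8 scratch buffers, s8buf and s8transbuf, once the transposed and recompressed weights have been written into dststor; before this commit they were never freed, which is the leak named in the commit title. Below is a minimal, self-contained sketch of that allocate / use / free pattern. The function and buffer names, and the plain malloc/free calls, are illustrative stand-ins rather than bestla's API.

    // Sketch only: scratch buffers are allocated for a conversion pass and must
    // be released on the way out, mirroring the guarded utils::afree calls added above.
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    static void convert_with_scratch(const int8_t* src, int8_t* dst, size_t n) {
      int8_t* scratch = static_cast<int8_t*>(std::malloc(n));   // plays the role of s8buf
      int8_t* scratch2 = static_cast<int8_t*>(std::malloc(n));  // plays the role of s8transbuf
      if (scratch && scratch2) {
        std::memcpy(scratch, src, n);       // stand-in for unpacking the weight
        std::memcpy(scratch2, scratch, n);  // stand-in for transposeWeight
        std::memcpy(dst, scratch2, n);      // stand-in for compressWeight
      }
      // Without these two frees, every conversion leaks 2*n bytes.
      if (scratch) std::free(scratch);
      if (scratch2) std::free(scratch2);
    }

    int main() {
      int8_t src[64] = {}, dst[64] = {};
      convert_with_scratch(src, dst, sizeof(src));
      return 0;
    }

Freeing a null pointer is a no-op for std::free; the diff keeps explicit if guards around utils::afree, a safe choice when an allocator's null handling is not documented.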
neural_speed/core/layers/ne_bestla_sycl.cpp (3 additions, 0 deletions)
@@ -138,6 +138,9 @@ void bestla_device_load_storage(void* hoststor, void* devstor, void* deviceptr,
dstor->fromHost(transtor, (sycl::queue*)device_queue);
}
}
+ if (ptr) {
+   delete ptr;
+ }
}

template <class GCT>
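The second change deletes the host-side storage object ptr once bestla_device_load_storage has pushed its contents to the SYCL device through fromHost, so the host copy no longer outlives the call. A rough sketch of that ownership pattern follows; HostStorage, upload_to_device, and load_storage are made-up names for illustration, not the library's types.

    // Sketch only: a temporary host-side object is built, its data is uploaded
    // (a vector stands in for device memory here), and then it is deleted,
    // which is the step the commit adds with delete ptr.
    #include <vector>

    struct HostStorage {
      std::vector<float> weights;
    };

    static void upload_to_device(const HostStorage& src, std::vector<float>& device_mem) {
      device_mem = src.weights;  // stand-in for dstor->fromHost(transtor, queue)
    }

    static void load_storage(const std::vector<float>& blob, std::vector<float>& device_mem) {
      HostStorage* ptr = new HostStorage{blob};  // deserialized host-side copy
      upload_to_device(*ptr, device_mem);
      // delete on a null pointer is a no-op, so the if (ptr) guard in the diff
      // is purely defensive; the important part is that the delete now happens.
      delete ptr;
    }

    int main() {
      std::vector<float> blob(16, 1.0f), device_mem;
      load_storage(blob, device_mem);
      return 0;
    }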
neural_speed/models/llama/llama_utils.cpp (2 additions, 2 deletions)
@@ -97,8 +97,8 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
int n_cpu_layer = n_layer - n_gpu_layer;
n_cpu_layer = n_cpu_layer < 0 ? 0 : n_cpu_layer;
fprintf(stderr, "%s: ctx size = %7.2f MB\n", __func__, ctx_size / 1024.0 / 1024.0);
- auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
- auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
+ auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + (50 << 20);
+ auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + (50 << 20);
fprintf(stderr, "%s: host ctx size = %7.2f MB\n", __func__, host_size / 1024.0 / 1024.0);
#ifdef NS_SYCL
fprintf(stderr, "%s: device ctx size = %7.2f MB\n", __func__, device_size / 1024.0 / 1024.0);
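The llama_utils.cpp change replaces the extra slack term n_embd * n_vocab * sizeof(float) with a flat 50 MB ((50 << 20) bytes) in both the host and device budgets, matching the "set lower extra memory size" part of the commit title. The snippet below compares the two terms for an assumed 7B-class Llama configuration (n_embd = 4096, n_vocab = 32000); these values are illustrative and not taken from the diff.

    // Back-of-the-envelope comparison of the old and new extra-memory terms.
    #include <cstdio>

    int main() {
      const double n_embd = 4096, n_vocab = 32000;                // assumed model dims
      const double old_extra = n_embd * n_vocab * sizeof(float);  // old slack term
      const double new_extra = double(50 << 20);                  // new flat 50 MB
      std::printf("old extra: %.1f MB\n", old_extra / 1024.0 / 1024.0);  // prints 500.0
      std::printf("new extra: %.1f MB\n", new_extra / 1024.0 / 1024.0);  // prints 50.0
      return 0;
    }

Under that assumption the old term reserved about 500 MB in each budget, so the change trims roughly 450 MB of slack from both the host and device context sizes.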
