This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

run model with all FFN layers on SYCL
luoyu-intel committed Jun 5, 2024
1 parent 87c5823 commit 9e2c971
Showing 6 changed files with 97 additions and 15 deletions.
3 changes: 2 additions & 1 deletion bestla/bestla/sycl/sycl_device.h
@@ -40,9 +40,10 @@ class SyclDevice {

    auto d_selector{sycl::default_selector_v};
    if (profile) {
-     sycl::property_list prop = {sycl::property::queue::enable_profiling()};
+     sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
      mQueue = sycl::queue(d_selector, exception_handler, prop);
    } else {
+     sycl::property_list prop = {sycl::property::queue::in_order()};
      mQueue = sycl::queue(d_selector, exception_handler);
    }
  }
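
Both queue constructions now request sycl::property::queue::in_order(), so kernels submitted to mQueue run in submission order without explicit event chaining (as shown, the non-profiling branch still constructs the queue without passing prop). A minimal standalone sketch of what an in-order queue buys, not part of the commit and using only standard SYCL 2020 calls:

#include <sycl/sycl.hpp>

int main() {
  // In-order queue: each submitted kernel implicitly waits for the previous one.
  sycl::property_list props{sycl::property::queue::in_order()};
  sycl::queue q(sycl::default_selector_v, props);
  float* x = sycl::malloc_device<float>(1024, q);
  q.fill(x, 1.0f, 1024);                                   // kernel 1
  q.parallel_for(sycl::range<1>(1024),
                 [=](sycl::id<1> i) { x[i] *= 2.0f; });    // kernel 2, ordered after kernel 1
  q.wait();
  sycl::free(x, q);
  return 0;
}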
6 changes: 5 additions & 1 deletion neural_speed/core/layers/ne_bestla.cpp
@@ -186,11 +186,12 @@ ne_backend bestla_backend_support(struct ne_tensor* src0, struct ne_tensor* src1
        bk = src_on_devce ? NE_BACKEND_SYCL : NE_BACKEND_CPU;
      }
    } break;
+   case NE_OP_SILU:
    case NE_OP_MUL: {
      if (src0->type == NE_TYPE_F32 || src0->type == NE_TYPE_F16) {
        bk = src_on_devce ? NE_BACKEND_SYCL : NE_BACKEND_CPU;
      }
-   }
+   } break;
    default:
      break;
  }
@@ -216,6 +217,9 @@ bool bestla_support(struct ne_tensor* node, int n_threads, size_t* workspace, si
  size_t ws_h = 0;
  size_t ws_d = 0;
  bool support = false;
+ if (node->backend==NE_BACKEND_SYCL) {
+   support = true;
+ }
  switch (node->op) {
    case NE_OP_MUL_MAT: {
      struct ne_tensor* wei = node->src0;
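
The first hunk lets NE_OP_SILU fall through to the NE_OP_MUL arm, so SiLU is routed to SYCL under the same condition as element-wise multiply (an F32/F16 input whose data already lives on the device); the second hunk makes bestla_support accept any node already tagged NE_BACKEND_SYCL before the per-op workspace sizing. A hedged sketch of the resulting routing rule, reduced to the pieces visible above (the helper name and reduced signature are illustrative, not from the commit):

// Illustrative reduction of the backend choice after this commit.
static ne_backend route_silu_or_mul(enum ne_op op, enum ne_type src_type, bool src_on_devce) {
  ne_backend bk = NE_BACKEND_CPU;
  switch (op) {
    case NE_OP_SILU:  // newly added: falls through to the MUL rule
    case NE_OP_MUL:
      if (src_type == NE_TYPE_F32 || src_type == NE_TYPE_F16) {
        bk = src_on_devce ? NE_BACKEND_SYCL : NE_BACKEND_CPU;
      }
      break;
    default:
      break;
  }
  return bk;
}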
82 changes: 77 additions & 5 deletions neural_speed/core/layers/ne_bestla_sycl.cpp
@@ -140,7 +140,6 @@ void bestla_device_load_storage(void* hoststor, void* devstor, void* deviceptr,
  }
}


template <class GCT>
using ProAT = sycl_prologue_a::ActivationBase<GCT, float>;
template <class GCT>
@@ -153,20 +152,93 @@ void bestla_device_f32f32_forward(float* activation, void* weiptr, float* output
  auto dstor = (sycl_storage::StorageWeightKBlockNInteger*)weiptr;
  if (_m == 1) {
    using ProB = ProBTransT<GemmCore>;
-   auto e_esimd = ProB::gemv(activation, {(uint8_t*)dstor->mQBuf, (float*)dstor->mSBuf, dstor->mCStep}, output, _n, _k,
+   auto ev = ProB::gemv(activation, {(uint8_t*)dstor->mQBuf, (float*)dstor->mSBuf, dstor->mCStep}, output, _n, _k,
                          dstor->mBlockSize, (sycl::queue*)queue);
+   ev.wait();
  } else {
    using KernelTLauncher = sycl_wrapper::LauncherWOQ<ProAT, ProBTransT, EpiT, GemmCore>;
    utils::GemmProblem gp(1, _m, _n, _k);
-   auto e_esimd = KernelTLauncher::compute(
+   auto ev = KernelTLauncher::compute(
        (sycl::queue*)queue, _m, _n, _k, dstor->mBlockSize,
        {{activation, lda}, {(uint8_t*)dstor->mQBuf, (float*)dstor->mSBuf, dstor->mCStep}, {output, ldo}});
+   ev.wait();
  }
}

void bestla_device_mul_f32(const struct ne_compute_params* params, const struct ne_tensor* src0,
-                          const struct ne_tensor* src1, struct ne_tensor* dst)
-{
+                          const struct ne_tensor* src1, struct ne_tensor* dst) {
  auto q = (sycl::queue*)params->dev_queue;

  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];

  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
  const int64_t ne12 = src1->ne[2];
  const int64_t ne13 = src1->ne[3];

  const size_t nb00 = src0->nb[0];
  const size_t nb01 = src0->nb[1];
  const size_t nb02 = src0->nb[2];
  const size_t nb03 = src0->nb[3];

  const size_t nb10 = src1->nb[0];
  const size_t nb11 = ne11 == 1 ? 0 : src1->nb[1];
  const size_t nb12 = ne12 == 1 ? 0 : src1->nb[2];
  const size_t nb13 = ne13 == 1 ? 0 : src1->nb[3];

  const size_t nb0 = dst->nb[0];
  const size_t nb1 = dst->nb[1];
  const size_t nb2 = dst->nb[2];
  const size_t nb3 = dst->nb[3];
  sycl::range<1> num_items{ne00 * ne01 * ne02 * ne03};
  auto ev = q->submit([&](sycl::handler& cgh) {
    cgh.parallel_for(num_items, [=](auto it) {
      int i = it;
      int i00 = i % ne00;
      i /= ne00;
      int i01 = i % ne01;
      i /= ne01;
      int i02 = i % ne02;
      i /= ne02;
      int i03 = i % ne03;

      int i13 = i03 % ne13;
      int i12 = i02 % ne12;
      int i11 = i01 % ne11;

      float* dst_ptr = (float*)((char*)dst->data + i03 * nb3 + i02 * nb2 + i01 * nb1);
      float* src0_ptr = (float*)((char*)src0->data + i03 * nb03 + i02 * nb02 + i01 * nb01);
      float* src1_ptr = (float*)((char*)src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11);
      dst_ptr[i00] = src0_ptr[i00] * src1_ptr[i00];
    });
  });
  ev.wait();
}
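
The kernel flattens all four tensor dimensions into one 1-D range and recovers (i00, i01, i02, i03) by repeated modulo/divide. Broadcasting of src1 over the outer dimensions is handled two ways at once: the index is wrapped with % ne1x, and the corresponding byte stride is forced to 0 on the host when that extent is 1; dimension 0 is assumed to match (the kernel indexes src1_ptr[i00] directly). A host-side reference of the same arithmetic on plain contiguous row-major arrays, offered only as a hedged sketch (the function name and flat-array layout are illustrative, not from the commit):

#include <cstdint>

// Hypothetical CPU reference for the index math in the SYCL kernel above.
// Assumes contiguous row-major float arrays and ne10 == ne00 (no dim-0 broadcast),
// matching the kernel's use of src1_ptr[i00].
void mul_f32_broadcast_ref(const float* src0, const float* src1, float* dst,
                           int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                           int64_t ne11, int64_t ne12, int64_t ne13) {
  for (int64_t idx = 0; idx < ne00 * ne01 * ne02 * ne03; ++idx) {
    int64_t i = idx;
    int64_t i00 = i % ne00; i /= ne00;  // fastest-varying dimension
    int64_t i01 = i % ne01; i /= ne01;
    int64_t i02 = i % ne02; i /= ne02;
    int64_t i03 = i % ne03;
    // src1 indices wrap around its (possibly smaller) outer extents: broadcasting.
    int64_t i13 = i03 % ne13, i12 = i02 % ne12, i11 = i01 % ne11;
    int64_t d = ((i03 * ne02 + i02) * ne01 + i01) * ne00 + i00;  // equals idx for src0/dst
    int64_t s = ((i13 * ne12 + i12) * ne11 + i11) * ne00 + i00;
    dst[d] = src0[d] * src1[s];
  }
}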

void bestla_device_elewise_f32(const struct ne_compute_params* params, const struct ne_tensor* src0,
                               struct ne_tensor* dst) {
  auto q = (sycl::queue*)params->dev_queue;
  auto op = dst->op;
  const int64_t ne00 = src0->ne[0];
  const int64_t ne01 = src0->ne[1];
  const int64_t ne02 = src0->ne[2];
  const int64_t ne03 = src0->ne[3];

  sycl::range<1> num_items{ne00 * ne01 * ne02 * ne03};
  auto ev = q->submit([&](sycl::handler& cgh) {
    cgh.parallel_for(num_items, [=](auto it) {
      int i = it;
      float* dst_ptr = ((float*)dst->data + i);
      float srcval = *((float*)src0->data + i);
      if (op == NE_OP_SILU) {
        srcval = ne_silu_f32(srcval);
      }
      *dst_ptr = srcval;
    });
  });
  ev.wait();
}
#endif
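
bestla_device_elewise_f32 (above) branches on dst->op inside the kernel, so the same entry point can host additional unary ops later; for any op other than NE_OP_SILU it currently reduces to a copy. The per-element function is the usual SiLU. A hedged one-liner of the formula assumed for ne_silu_f32 (matching the CPU definition in ne_layers.c), for reference only:

#include <cmath>

// Assumed per-element definition behind ne_silu_f32: SiLU(x) = x * sigmoid(x) = x / (1 + e^-x).
static inline float silu_ref(float x) { return x / (1.0f + std::exp(-x)); }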
1 change: 1 addition & 0 deletions neural_speed/core/ne_bestla.h
@@ -99,6 +99,7 @@ void bestla_device_f32f32_forward(float* activation, void* weiptr, float* output
                                  int ldo, void* workspace, void* queue);
void bestla_device_mul_f32(const struct ne_compute_params* params, const struct ne_tensor* src0,
                           const struct ne_tensor* src1, struct ne_tensor* dst);
+void bestla_device_elewise_f32(const struct ne_compute_params* params, const struct ne_tensor* src0, struct ne_tensor* dst);
#endif
#ifdef __cplusplus
}
16 changes: 10 additions & 6 deletions neural_speed/core/ne_layers.c
@@ -2141,11 +2141,12 @@ struct ne_tensor* ne_silu_impl(struct ne_context* ctx, struct ne_tensor* a, bool
  if (!inplace && (a->grad)) {
    is_node = true;
  }
+ enum ne_op op = NE_OP_SILU;
+ enum ne_backend bk = bestla_backend_support(a, NULL, op);
+ struct ne_tensor* result = inplace ? ne_view_tensor_bk(ctx, a, bk) : ne_dup_tensor_bk(ctx, a, bk);

- struct ne_tensor* result = inplace ? ne_view_tensor(ctx, a) : ne_dup_tensor(ctx, a);

- result->op = NE_OP_SILU;
- result->grad = is_node ? ne_dup_tensor(ctx, result) : NULL;
+ result->op = op;
+ result->grad = NULL;
  result->src0 = a;
  result->src1 = NULL;
  return result;
@@ -4446,7 +4447,7 @@ static void ne_compute_forward_add_f32(const struct ne_compute_params* params, c
  if (params->type == NE_TASK_FINALIZE) {
#ifdef NS_SYCL
    if (params->ith == 0) {
-     if (src1->backend != NE_BACKEND_CPU) {
+     if (dst->backend != NE_BACKEND_CPU) {
        bestla_device_memcpy_sync(dst->data, dstptr, dst->size, params->dev_queue);
      }
    }
@@ -6274,7 +6275,10 @@ static void ne_compute_forward_silu_f32(const struct ne_compute_params* params,
  NE_ASSERT(ne_is_contiguous_except_dim_1(src0));
  NE_ASSERT(ne_is_contiguous_except_dim_1(dst));
  NE_ASSERT(ne_are_same_shape(src0, dst));

+ if (dst->backend == NE_BACKEND_SYCL) {
+   bestla_device_elewise_f32(params, src0, dst);
+   return;
+ }
  if (params->type == NE_TASK_INIT || params->type == NE_TASK_FINALIZE) {
    return;
  }
4 changes: 2 additions & 2 deletions neural_speed/models/llama/llama_utils.cpp
@@ -201,9 +201,9 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
      if (ml->verify_tensor(layers_i + ".feed_forward.w1.weight")) {
        NE_ASSERT(n_expert == 0);
        NE_ASSERT(n_expert_used == 0);
-       layer.ffn[0] = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+       layer.ffn[0] = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, NE_BACKEND_SYCL);
        layer.ffn[1] = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {n_ff, n_embd}, NE_BACKEND_SYCL);
-       layer.ffn[2] = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+       layer.ffn[2] = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, NE_BACKEND_SYCL);
      } else {
        NE_ASSERT(n_expert > 0);
        NE_ASSERT(n_expert_used > 0);
