From 4ef245a92a968ba0f18a5adfd41e51980ce4fdf5 Mon Sep 17 00:00:00 2001 From: Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Date: Thu, 22 Feb 2024 18:15:13 +1000 Subject: [PATCH 01/15] mpt : add optional bias tensors (#5638) Update for MPT with optional bias parameters: to work with PhoGPT and SEA-LION models that were pre-trained with 'bias'. --- llama.cpp | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 259f2a3a3ea00..9cae8c761f3ac 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4054,6 +4054,8 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } @@ -4063,14 +4065,23 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); - layer.ffn_up = 
ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); // AWQ ScaleActivation layer layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); @@ -6171,7 +6182,7 @@ struct llm_build_context { attn_norm = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, - NULL, + model.layers[il].attn_norm_b, LLM_NORM, cb, il); cb(attn_norm, "attn_norm", il); @@ -6181,6 +6192,11 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + + if (model.layers[il].bqkv){ + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } if (hparams.f_clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -6198,7 +6214,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, NULL, + model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -6211,13 +6227,13 @@ struct llm_build_context { { cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, - NULL, + model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); cur = llm_build_ffn(ctx0, cur, - 
model.layers[il].ffn_up, NULL, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, - model.layers[il].ffn_down, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); @@ -6234,7 +6250,7 @@ struct llm_build_context { cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, - NULL, + model.output_norm_b, LLM_NORM, cb, -1); cb(cur, "result_norm", -1); From c5688c6250430d2b8e0259efcf26c16dfa4c1f46 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Thu, 22 Feb 2024 08:27:32 +0000 Subject: [PATCH 02/15] server : clarify some params in the docs (#5640) --- examples/server/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 4b24ee5dc3f28..4b6cd8326efa8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -151,7 +151,7 @@ node index.js `temperature`: Adjust the randomness of the generated text (default: 0.8). - `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled). + `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled). `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0). @@ -209,7 +209,7 @@ node index.js `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1) - `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false) + `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false) `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. 
[See more](#change-system-prompt-on-runtime) @@ -242,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) -- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model` +- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token From a46f50747b2028f7f9c9883b26bfba12bf92556e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 22 Feb 2024 09:33:24 +0100 Subject: [PATCH 03/15] server : fallback to chatml, add AlphaMonarch chat template (#5628) * server: fallback to chatml * add new chat template * server: add AlphaMonarch to test chat template * server: only check model template if there is no custom tmpl * remove TODO --- examples/server/server.cpp | 15 +++++++++++++++ llama.cpp | 9 +++++++++ tests/test-chat-template.cpp | 23 +++++++++++++++-------- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c84719a0d15d0..369121e885b27 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -400,6 +400,16 @@ struct llama_server_context return true; } + void validate_model_chat_template(server_params & sparams) { + llama_chat_message chat[] = {{"user", "test"}}; + std::vector buf(1); + int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), 
buf.size()); + if (res < 0) { + LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); + sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template + } + } + void initialize() { // create slots all_slots_are_idle = true; @@ -2752,6 +2762,11 @@ int main(int argc, char **argv) LOG_INFO("model loaded", {}); } + if (sparams.chat_template.empty()) { // custom chat template is not supplied + // check if the template comes with the model is supported by us + llama.validate_model_chat_template(sparams); + } + // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation diff --git a/llama.cpp b/llama.cpp index 9cae8c761f3ac..055b57e3187f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12773,6 +12773,15 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } + } else if (tmpl.find("bos_token + message['role']") != std::string::npos) { + // mlabonne/AlphaMonarch-7B template (the <s> is included inside history) + for (auto message : chat) { + std::string bos = (message == chat.front()) ? 
"" : "<s>"; // skip BOS for first message + ss << bos << message->role << "\n" << message->content << "</s>\n"; + } + if (add_ass) { + ss << "<s>assistant\n"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 9830650d4f8dd..d02b39e144947 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -27,12 +27,20 @@ int main(void) { "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}", // bofenghuang/vigogne-2-70b-chat "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + // mlabonne/AlphaMonarch-7B + "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", }; - std::vector expected_substr = { - "<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant", - "[/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - "[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - "[/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + std::vector expected_output = { + // teknium/OpenHermes-2.5-Mistral-7B + "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n", + // mistralai/Mistral-7B-Instruct-v0.2 + "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question 
[/INST]", + // TheBloke/FusionNet_34Bx2_MoE-AWQ + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // bofenghuang/vigogne-2-70b-chat + "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", + // mlabonne/AlphaMonarch-7B + "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", }; std::vector formatted_chat(1024); int32_t res; @@ -43,7 +51,7 @@ int main(void) { for (size_t i = 0; i < templates.size(); i++) { std::string custom_template = templates[i]; - std::string substr = expected_substr[i]; + std::string expected = expected_output[i]; formatted_chat.resize(1024); res = llama_chat_apply_template( nullptr, @@ -57,8 +65,7 @@ int main(void) { formatted_chat.resize(res); std::string output(formatted_chat.data(), formatted_chat.size()); std::cout << output << "\n-------------------------\n"; - // expect the "formatted_chat" to contain pre-defined strings - assert(output.find(substr) != std::string::npos); + assert(output == expected); } return 0; } From 56d03d92be57f5880b9ed94542d87bb6effae31f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 10:35:54 +0200 Subject: [PATCH 04/15] readme : update hot topics --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1624b9f9a348..3bc512af0602b 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics +- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 - Non-linear quantization IQ4_NL: 
https://github.com/ggerganov/llama.cpp/pull/5590 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 From 3a03541cedea474fa9d41214484cc3fbcf468a9e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 13:54:03 +0200 Subject: [PATCH 05/15] minor : fix trailing whitespace (#5638) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 055b57e3187f2..6ab5e1bf4f409 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6192,7 +6192,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - + if (model.layers[il].bqkv){ cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); From 4cb4d8b22d4fda971621a68c570ce84d66897c37 Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 22 Feb 2024 16:32:09 +0000 Subject: [PATCH 06/15] workflows: nix: hardcode cachix ids, build unconditionally (#5663) GitHub does not expose environment and repository variables to PRs coming from forks implies that we've been disabling the Nix CI actions for most PRs. The `if:` also didn't make much sense, because we can always pull from cachix, and there's no point (albeit no risk either) in pushing cache for the untrusted code. 
--- .github/workflows/nix-ci-aarch64.yml | 7 +++---- .github/workflows/nix-ci.yml | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 0c6cf5f091528..8d0a3fd7fd313 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -19,7 +19,6 @@ on: jobs: nix-build-aarch64: - if: ${{ vars.CACHIX_NAME != '' }} runs-on: ubuntu-latest steps: - name: Checkout repository @@ -37,8 +36,8 @@ jobs: extra-conf: | extra-platforms = aarch64-linux extra-system-features = nixos-test kvm - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -46,7 +45,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} + name: llama-cpp - name: Show all output paths run: > nix run github:nix-community/nix-eval-jobs diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index d19c7a576cdf6..01c5a9d5aaca2 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -23,8 +23,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters 
= https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -37,7 +37,6 @@ jobs: --flake ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" nix-build: - if: ${{ vars.CACHIX_NAME != '' }} strategy: fail-fast: false matrix: @@ -51,8 +50,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -60,7 +59,7 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} + name: llama-cpp - name: Build run: > nix run github:Mic92/nix-fast-build From 373ee3fbbabc4c1508eed4f5c3795b23a20939a3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 22 Feb 2024 19:10:21 +0100 Subject: [PATCH 07/15] Add Gemma chat template (#5665) * add gemma chat template * gemma: only apply system_prompt on non-model message --- llama.cpp | 22 ++++++++++++++++++++++ tests/test-chat-template.cpp | 4 ++++ 2 files changed, 26 insertions(+) diff --git a/llama.cpp b/llama.cpp index 6ab5e1bf4f409..40dda265ccc93 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12782,6 +12782,28 @@ static int32_t 
llama_chat_apply_template_internal( if (add_ass) { ss << "<s>assistant\n"; } + } else if (tmpl.find("<start_of_turn>") != std::string::npos) { + // google/gemma-7b-it + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken + system_prompt = trim(message->content); + continue; + } + // in gemma, "assistant" is "model" + role = role == "assistant" ? "model" : message->role; + ss << "<start_of_turn>" << role << "\n"; + if (!system_prompt.empty() && role != "model") { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << trim(message->content) << "<end_of_turn>\n"; + } + if (add_ass) { + ss << "<start_of_turn>model\n"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index d02b39e144947..fa2eb577b6e42 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -29,6 +29,8 @@ int main(void) { "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", // mlabonne/AlphaMonarch-7B "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", + // google/gemma-7b-it + "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -41,6 +43,8 @@ int main(void) { "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", // mlabonne/AlphaMonarch-7B "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother 
question\nassistant\n", + // google/gemma-7b-it + "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", }; std::vector formatted_chat(1024); int32_t res; From 5a9e2f60ba3d8362ba17c77ac3092906d49b813f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 20:13:25 +0200 Subject: [PATCH 08/15] py : minor fixes (#5668) --- convert-hf-to-gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 9771fccf9ffc1..8630bbf2980c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -655,6 +655,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) def write_tensors(self): @@ -1031,7 +1033,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) def set_vocab(self): self._set_vocab_sentencepiece() From 201294ae177b308fb3a99dc504dd6d27e8afa907 Mon Sep 17 00:00:00 2001 From: Someone Date: Thu, 22 Feb 2024 19:44:10 +0000 Subject: [PATCH 09/15] nix: init singularity and docker images (#5056) Exposes a few attributes demonstrating how to build [singularity](https://docs.sylabs.io/guides/latest/user-guide/)/[apptainer](https://apptainer.org/) and Docker images re-using llama.cpp's Nix expression. 
Built locally on `x86_64-linux` with `nix build github:someoneserge/llama.cpp/feat/nix/images#llamaPackages.{docker,docker-min,sif,llama-cpp}` and it's fast and effective. --- .devops/nix/docker.nix | 37 +++++++++++++++++++++++++++++++++++++ .devops/nix/scope.nix | 3 +++ .devops/nix/sif.nix | 27 +++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 .devops/nix/docker.nix create mode 100644 .devops/nix/sif.nix diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix new file mode 100644 index 0000000000000..d607b4575772c --- /dev/null +++ b/.devops/nix/docker.nix @@ -0,0 +1,37 @@ +{ + lib, + dockerTools, + buildEnv, + llama-cpp, + interactive ? true, + coreutils, +}: + +# A tar that can be fed into `docker load`: +# +# $ nix build .#llamaPackages.docker +# $ docker load < result + +# For details and variations cf. +# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage +# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 +# - https://nixery.dev/ + +# Approximate (compressed) sizes, at the time of writing, are: +# +# .#llamaPackages.docker: 125M; +# .#llamaPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. 
+ +dockerTools.buildLayeredImage { + name = llama-cpp.pname; + tag = "latest"; + + contents = + [ llama-cpp ] + ++ lib.optionals interactive [ + coreutils + dockerTools.binSh + dockerTools.caCertificates + ]; +} diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index d295995a4b96b..78530c9e8a230 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -12,5 +12,8 @@ lib.makeScope newScope ( self: { inherit llamaVersion; llama-cpp = self.callPackage ./package.nix { }; + docker = self.callPackage ./docker.nix { }; + docker-min = self.callPackage ./docker.nix { interactive = false; }; + sif = self.callPackage ./sif.nix { }; } ) diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix new file mode 100644 index 0000000000000..7535ca0f3088e --- /dev/null +++ b/.devops/nix/sif.nix @@ -0,0 +1,27 @@ +{ + lib, + singularity-tools, + llama-cpp, + bashInteractive, + interactive ? false, +}: + +let + optionalInt = cond: x: if cond then x else 0; +in +singularity-tools.buildImage rec { + inherit (llama-cpp) name; + contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + + # These are excessive (but safe) for most variants. Building singularity + # images requires superuser privileges, so we build them inside a VM in a + # writable image of pre-determined size. 
+ # + # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 + # + # Expected image sizes: + # - cpu/blas: 150M, + # - cuda, all gencodes: 560M, + diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + memSize = diskSize; +} From efd56b1c2139d50b9b4381a212feb75d69598fda Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 18:31:40 +0200 Subject: [PATCH 10/15] ggml : 32-bit arm compat (whisper/1891) * ggml : 32-bit arm compat * ggml : add ggml_vqtbl1q_s8 impl * ggml : cont --- ggml-quants.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 6336538f0e99e..8917c8af14255 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -438,6 +438,30 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { return res; } +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + #else #define ggml_int16x8x2_t int16x8x2_t @@ -451,6 +475,7 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { #define ggml_vld1q_u8_x4 vld1q_u8_x4 #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 #endif @@ -9333,7 +9358,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const uint16_t gindex[8]; uint16x8x2_t vindex; int8x16x4_t q1b; - int8x16x4_t q8b; + ggml_int8x16x4_t q8b; uint16x8x4_t scales; int32x4x2_t sumi; int32x4x2_t dotq; @@ -9506,10 +9531,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * 
q8b.val[2] = vld1q_s8(y[ib+1].qs); q8b.val[3] = vld1q_s8(y[ib+1].qs + 16); - q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b)); - q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); - q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b)); - q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); + q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b)); + q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4)); + q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b)); + q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4)); prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); From 334f76fa385ed81095165e5ae068756214893901 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:21:05 +0200 Subject: [PATCH 11/15] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index bbbf88d9d6ff7..59de343706f2a 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -30805514e1bf389a59d30a54a0525cbdc30d5bd1 +8cdf783f288a98eddf521b0ab1b4d405be9e18ba From 7e4f339c404dbe029d4a117c03b37a9bf646cf0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:21:39 +0200 Subject: [PATCH 12/15] ggml : always define ggml_fp16_t as uint16_t (#5666) * ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci --- ggml-cuda.cu | 9 ++++----- ggml-impl.h | 27 ++++++++++++++++++++------- ggml-quants.c | 30 +++++++++++++++--------------- ggml.c | 6 +++--- ggml.h | 6 ------ 5 files changed, 
42 insertions(+), 36 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e7c211d7d6087..b0e454e025ec4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,3 +1,7 @@ +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" + #include #include #include @@ -121,11 +125,6 @@ #endif // defined(GGML_USE_HIPBLAS) -// ggml-cuda need half type so keep ggml headers include at last -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CC_PASCAL 600 diff --git a/ggml-impl.h b/ggml-impl.h index 19df66bceee4a..c5637e4d45d8c 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -53,11 +53,23 @@ extern "C" { // #include -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} #else @@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. 
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - +#if !defined(GGML_FP16_TO_FP32) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); @@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { } #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif #define GGML_HASHTABLE_FULL ((size_t)-1) diff --git a/ggml-quants.c b/ggml-quants.c index 8917c8af14255..b15977f53e2f3 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; - const float dmin = -y[i].d * (float)x[i].dmin; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); const uint8_t * restrict q2 = x[i].qs; const int8_t * restrict q8 = y[i].qs; @@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1)); q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2)); @@ -6660,7 +6660,7 @@ void 
ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]); - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r aux16[1] = (a[0] >> 4) & 0x0f0f; const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]); - sum_mins += y[i].d * (float)x[i].d[1] * summi; + sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi; - const float d = y[i].d * (float)x[i].d[0]; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]); const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); @@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d = y[i].d * (float)x[i].d; + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const int8_t * sc = x[i].scales; const uint8_t * restrict q5 = x[i].qs; @@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const float d_all = (float)x[i].d; + const float d_all = GGML_FP16_TO_FP32(x[i].d); const 
uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ib = 0; ib < nb; ib += 2) { - q4bits.val[0] = vld1q_u8(x[ib+0].qs); q4bits.val[1] = vld1q_u8(x[ib+1].qs); q8b.val[0] = vld1q_s8(y[ib+0].qs); @@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]); prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]); - sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2); - + sumf += + GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) + + GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2); } *s = sumf; diff --git a/ggml.c b/ggml.c index 5b9fa741a6479..d710fe702ddbd 100644 --- a/ggml.c +++ b/ggml.c @@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16]; // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { - return (float) GGML_FP16_TO_FP32(x); + return GGML_FP16_TO_FP32(x); } ggml_fp16_t ggml_fp32_to_fp16(float x) { @@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16x8 float16x8_t #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) #define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 @@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) 
vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_ADD vaddq_f32 diff --git a/ggml.h b/ggml.h index bed7a36a0ee6a..37eff627928e8 100644 --- a/ggml.h +++ b/ggml.h @@ -315,13 +315,7 @@ extern "C" { #endif -#if defined(__ARM_NEON) && defined(__CUDACC__) - typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - typedef __fp16 ggml_fp16_t; -#else typedef uint16_t ggml_fp16_t; -#endif // convert FP16 <-> FP32 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); From 847eedbdb2d1ebf14ef56eb507d4b4b975510908 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:22:48 +0200 Subject: [PATCH 13/15] py : add Gemma conversion from HF models (#5647) * py : add gemma conversion from HF models * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela * Update convert-hf-to-gguf.py Co-authored-by: Jared Van Bortel --------- Co-authored-by: Aarni Koskela Co-authored-by: Jared Van Bortel --- convert-hf-to-gguf.py | 60 +++++++++++++++++++++++++++++++++++++++++++ llama.cpp | 3 +++ 2 files changed, 63 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 8630bbf2980c1..481198dad042c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -218,6 +218,8 @@ def from_model_architecture(model_architecture): return BertModel if model_architecture == "NomicBertModel": return NomicBertModel + if model_architecture == "GemmaForCausalLM": + return GemmaModel return Model def _is_model_safetensors(self) -> bool: @@ -277,6 +279,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.BERT if arch == "NomicBertModel": return gguf.MODEL_ARCH.NOMIC_BERT + if arch == "GemmaForCausalLM": + return gguf.MODEL_ARCH.GEMMA raise NotImplementedError(f'Architecture "{arch}" 
not supported!') @@ -1786,6 +1790,62 @@ def get_tensors(self): yield name, data +class GemmaModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight 
tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + ###### CONVERSION LOGIC ###### diff --git a/llama.cpp b/llama.cpp index 40dda265ccc93..7770fa0e8f6fa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7450,6 +7450,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -7491,6 +7492,7 @@ struct llm_build_context { n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); cb(Qcur, "Qcur_scaled", il); @@ -7505,6 +7507,7 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); cb(cur, "kqv_out", il); } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); From 96633eeca1265ed03e57230de54032041c58f9cd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Feb 2024 23:23:46 +0200 Subject: [PATCH 14/15] gemma : use more bits for the token_embd.weight tensor (#5650) * gemma : use Q8_0 for the token_embd.weight tensor * llama : quantize token_embd.weight using output type --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 7770fa0e8f6fa..2ebd40df234f0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty return std::make_pair(i_layer, n_layer); }; - if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { + // for arches that share the same tensor between the token embeddings and the output, we quantize the token 
embeddings + // with the quantization of the output tensor + if (name == tn(LLM_TENSOR_OUTPUT, "weight") || + (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) { int nx = tensor->ne[0]; if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; From 15499eb94227401bdc8875da6eb85c15d37068f7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 22 Feb 2024 17:05:23 -0500 Subject: [PATCH 15/15] mpt : do not duplicate token_embd.weight on disk (#5670) --- convert-hf-to-gguf.py | 5 ----- llama.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 481198dad042c..9bdfce07ab7db 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -622,11 +622,6 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - class OrionModel(Model): def set_vocab(self): diff --git a/llama.cpp b/llama.cpp index 2ebd40df234f0..37477e6ef3c44 100644 --- a/llama.cpp +++ b/llama.cpp @@ -509,7 +509,6 @@ static std::map> LLM_TENSOR_NAMES = { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, - { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4056,7 +4055,10 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + // same as tok_embd, duplicated to 
allow offloading + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); } for (int i = 0; i < n_layer; ++i) {