From c9c3220c485c7bea740a07cda7343677fb3beaae Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Tue, 5 Sep 2023 19:41:00 +0200 Subject: [PATCH 1/6] convert: fix convert.py not working with int filename_stem (#3028) * fix implicit int to string conversion * convert : remove an obsolete pyright comment --------- Co-authored-by: Cebtenzzre --- convert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 5a7483b43d563..59d75141de5b2 100755 --- a/convert.py +++ b/convert.py @@ -673,7 +673,7 @@ def persistent_load(self, pid: Any) -> Any: assert isinstance(pid[1], LazyStorageKind) data_type = pid[1].data_type filename_stem = pid[2] - filename = self.data_base_path + '/' + filename_stem + filename = f'{self.data_base_path}/{filename_stem}' info = self.zip_file.getinfo(filename) def load(offset: int, elm_count: int) -> NDArray: @@ -689,7 +689,6 @@ def load(offset: int, elm_count: int) -> NDArray: @staticmethod def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, - # pyright: ignore[reportSelfClsParameterName] requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: assert isinstance(storage, LazyStorage) From de2fe892af92a5c7b5ef1beb7efbc0524343fbab Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 5 Sep 2023 15:10:27 -0400 Subject: [PATCH 2/6] examples : replace fprintf to stdout with printf (#3017) --- common/common.cpp | 188 +++++++++++++------------- common/log.h | 16 +-- examples/gguf/gguf.cpp | 42 +++--- examples/gptneox-wip/falcon-main.cpp | 66 ++++----- examples/gptneox-wip/gptneox-main.cpp | 62 ++++----- examples/llama-bench/llama-bench.cpp | 40 +++--- examples/server/server.cpp | 76 +++++------ 7 files changed, 245 insertions(+), 245 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d4f9dbf556299..22f65ac469b50 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -584,109 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stdout, "usage: %s [options]\n", argv[0]); - fprintf(stdout, "\n"); - fprintf(stdout, "options:\n"); - fprintf(stdout, " -h, --help show this help message and exit\n"); - fprintf(stdout, " -i, --interactive run in interactive mode\n"); - fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n"); - fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n"); - fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n"); - fprintf(stdout, " (can be specified more than once for multiple prompts).\n"); - fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n"); - fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stdout, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stdout, " prompt to start generation with (default: empty)\n"); - fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - fprintf(stdout, " 
--prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - fprintf(stdout, " not supported with --interactive or other interactive options\n"); - fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - fprintf(stdout, " --random-prompt start with a randomized prompt.\n"); - fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); - fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - fprintf(stdout, " -f FNAME, --file FNAME\n"); - fprintf(stdout, " prompt file to start generation.\n"); - fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); - fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); - fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); - fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); - fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); - fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); - fprintf(stdout, " --mirostat N use Mirostat sampling.\n"); - fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); - fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); - fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); - fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n"); - fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); - fprintf(stdout, " --grammar-file FNAME file to read grammar from\n"); - fprintf(stdout, " --cfg-negative-prompt PROMPT\n"); - fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); - fprintf(stdout, " --cfg-negative-prompt-file FNAME\n"); - fprintf(stdout, " negative prompt file to use for guidance. 
(default: empty)\n"); - fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); - fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); - fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); - fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); - fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); - fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); - fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); - fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); - fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -i, --interactive run in interactive mode\n"); + printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); + printf(" -r PROMPT, --reverse-prompt PROMPT\n"); + printf(" halt generation at PROMPT, return control in interactive mode\n"); + printf(" (can be specified more than once for multiple prompts).\n"); + printf(" --color colorise output to distinguish prompt and user input from generations\n"); + printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -p PROMPT, --prompt PROMPT\n"); + printf(" prompt to start generation with (default: empty)\n"); + printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); + printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); + printf(" not supported with --interactive or other interactive options\n"); + printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); + printf(" --random-prompt start with a randomized prompt.\n"); + printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); + printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + printf(" --in-suffix STRING string 
to suffix after user inputs with (default: empty)\n"); + printf(" -f FNAME, --file FNAME\n"); + printf(" prompt file to start generation.\n"); + printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + printf(" --mirostat N use Mirostat sampling.\n"); + printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + printf(" modifies the likelihood of token appearing in the completion,\n"); + printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); + printf(" --grammar-file FNAME file to read grammar from\n"); + printf(" --cfg-negative-prompt PROMPT\n"); + printf(" negative prompt to use for guidance. (default: empty)\n"); + printf(" --cfg-negative-prompt-file FNAME\n"); + printf(" negative prompt file to use for guidance. 
(default: empty)\n"); + printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale); + printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base); + printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); + printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + printf(" --no-penalize-nl do not penalize newline token\n"); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); + printf(" --perplexity compute perplexity over each ctx window of the prompt\n"); + printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); + printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); + printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); + printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { - fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); - fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n"); - fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); + printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); - fprintf(stdout, " number of layers to store in VRAM\n"); - fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); #ifdef GGML_USE_CUBLAS - fprintf(stdout, " -nommq, --no-mul-mat-q\n"); - fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // GGML_USE_CUBLAS #endif - fprintf(stdout, " --mtest compute maximum memory usage\n"); - fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); - fprintf(stdout, " --verbose-prompt print prompt before generation\n"); + printf(" --mtest compute maximum memory usage\n"); + printf(" --export export the computation graph to 'llama.ggml'\n"); + printf(" --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); - fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stdout, " -m FNAME, --model FNAME\n"); - fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); - fprintf(stdout, " -md FNAME, --model-draft FNAME\n"); - fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str()); - fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n"); - fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n"); - fprintf(stdout, "\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -md FNAME, --model-draft FNAME\n"); + printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); + printf(" -ld LOGDIR, --logdir LOGDIR\n"); + printf(" path under which to save YAML logs (no logging if unset)\n"); + printf("\n"); } std::string gpt_random_prompt(std::mt19937 & rng) { diff --git a/common/log.h b/common/log.h index 0b9b01052b87c..18f3b9761a788 100644 --- a/common/log.h +++ b/common/log.h @@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & inline void log_print_usage() { - fprintf(stdout, "log options:\n"); + printf("log options:\n"); /* format - fprintf(stdout, " -h, --help show this help message and exit\n");*/ + printf(" -h, --help show this help message and exit\n");*/ /* spacing - fprintf(stdout, "__-param----------------Description\n");*/ - fprintf(stdout, " --log-test Run simple logging test\n"); - fprintf(stdout, " --log-disable Disable trace logs\n"); - fprintf(stdout, " --log-enable Enable trace logs\n"); - fprintf(stdout, " --log-file Specify a log filename (without extension)\n"); - fprintf(stdout, " Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ + printf("__-param----------------Description\n");*/ + printf(" --log-test Run simple logging test\n"); + printf(" --log-disable Disable trace logs\n"); + printf(" --log-enable Enable trace logs\n"); + printf(" --log-file Specify a log filename (without 
extension)\n"); + printf(" Log file will be tagged with unique ID and written as \"..log\"\n"); /* */ } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index cda517bde405f..a34010f1022a7 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) { gguf_write_to_file(ctx, fname.c_str(), false); - fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str()); + printf("%s: wrote file '%s;\n", __func__, fname.c_str()); ggml_free(ctx_data); gguf_free(ctx); @@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); - fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); - fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); + printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); // kv { const int n_kv = gguf_get_n_kv(ctx); - fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + printf("%s: n_kv: %d\n", __func__, n_kv); for (int i = 0; i < n_kv; ++i) { const char * key = gguf_get_key(ctx, i); - fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + printf("%s: kv[%d]: key = %s\n", __func__, i, key); } } @@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) { const int keyidx = gguf_find_key(ctx, findkey); if (keyidx == -1) { - fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey); + printf("%s: find key: %s not found.\n", __func__, findkey); } else { const char * key_value = gguf_get_val_str(ctx, keyidx); - fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value); + printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value); } } @@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) { { const int n_tensors = gguf_get_n_tensors(ctx); - fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } @@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); - fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); - fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); + printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); // kv { const int n_kv = gguf_get_n_kv(ctx); - fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + printf("%s: n_kv: %d\n", __func__, n_kv); for (int i = 0; i < n_kv; ++i) { const char * key = gguf_get_key(ctx, i); - fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + printf("%s: kv[%d]: key = %s\n", __func__, i, key); } } @@ 
-174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) { { const int n_tensors = gguf_get_n_tensors(ctx); - fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } @@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) { const int n_tensors = gguf_get_n_tensors(ctx); for (int i = 0; i < n_tensors; ++i) { - fprintf(stdout, "%s: reading tensor %d data\n", __func__, i); + printf("%s: reading tensor %d data\n", __func__, i); const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; @@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) { } } - fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); + printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); ggml_free(ctx_data); gguf_free(ctx); @@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) { int main(int argc, char ** argv) { if (argc < 3) { - fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]); + printf("usage: %s data.gguf r|w\n", argv[0]); return -1; } diff --git a/examples/gptneox-wip/falcon-main.cpp b/examples/gptneox-wip/falcon-main.cpp index 43b6a29f312ed..d4b130b254c00 100644 --- a/examples/gptneox-wip/falcon-main.cpp +++ b/examples/gptneox-wip/falcon-main.cpp @@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name) struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); if( cur == NULL ) { - fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str()); + printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); } else { -// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); +// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); } return cur; @@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ return false; } - fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); + printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); + printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); + printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); // print all kv #if 0 { const int n_kv = gguf_get_n_kv(ggufctx); - fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + printf("%s: n_kv: %d\n", __func__, n_kv); for (int i = 0; i < n_kv; ++i) { const char * key = gguf_get_key(ggufctx, i); - fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + printf("%s: kv[%d]: key = %s\n", __func__, i, key); } } #endif @@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, 
falcon_model & model, gpt2bpe_ int keyidx; keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); - if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } // check required metadata @@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) { - fprintf(stdout, "%s: model architecture not supported!\n", __func__); + printf("%s: model architecture not supported!\n", __func__); return false; } } else { - fprintf(stdout, "%s: gguf model architecture not found!\n", __func__); + printf("%s: gguf model architecture not found!\n", __func__); return false; } @@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout"); if (keyidx != -1) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) { - fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__); + printf("%s: model tensor data layout not supported!\n", __func__); return false; } } else { - fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__); + printf("%s: gguf model tensor data layout not found!\n", __func__); return false; } @@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, 
gpt2bpe_ if (keyidx != -1) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - fprintf(stdout, "%s: tokenizer model not supported!\n", __func__); + printf("%s: tokenizer model not supported!\n", __func__); return false; } } else { - fprintf(stdout, "%s: tokenizer model not found!\n", __func__); + printf("%s: tokenizer model not found!\n", __func__); return false; } @@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); if (tokens_keyidx == -1) { - fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__); + printf("%s: gpt2 tokenizer vocab not found!\n", __func__); return false; } int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); if (merges_keyidx == -1) { - fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__); + printf("%s: gpt2 tokenizer merges not found!\n", __func__); return false; } hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); + printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); + printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); for (size_t i = 0; i < hparams.n_vocab; i++) { std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); @@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); } + if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } + if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } + if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } + if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } + if( vocab.special_pad_id != -1 ) { 
printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } + if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } } @@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_ { const int n_tensors = gguf_get_n_tensors(ggufctx); - fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ggufctx, i); const size_t offset = gguf_get_tensor_offset(ggufctx, i); - fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } #endif diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp index 6291523f2f69e..b6cc46c5f4299 100644 --- a/examples/gptneox-wip/gptneox-main.cpp +++ b/examples/gptneox-wip/gptneox-main.cpp @@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name) struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); if( cur == NULL ) { - fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str()); + printf("%s: tensor '%s' not found!\n", __func__, name.c_str()); } else { -// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); +// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name); } return cur; @@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 return false; } - fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); - fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); - fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); + printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx)); + printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx)); + printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx)); // print all kv #if 0 { const int n_kv = gguf_get_n_kv(ggufctx); - fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + printf("%s: n_kv: %d\n", __func__, n_kv); for (int i = 0; i < n_kv; ++i) { const char * key = gguf_get_key(ggufctx, i); - fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + printf("%s: kv[%d]: key = %s\n", __func__, i, key); } } #endif @@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 int keyidx; keyidx = gguf_find_key(ggufctx, "general.name"); - if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.description"); - if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.author"); - if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.license"); - if (keyidx != -1) { 
fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.architecture"); - if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.file_type"); - if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout"); - if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository"); - if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } + if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); } } // check required metadata @@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 keyidx = gguf_find_key(ggufctx, "general.architecture"); if (keyidx != -1) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) { - fprintf(stdout, "%s: model architecture not supported!\n", __func__); + printf("%s: model architecture not supported!\n", __func__); return false; } } else { - fprintf(stdout, "%s: gguf model architecture not found!\n", __func__); + printf("%s: gguf model architecture not found!\n", __func__); return false; } @@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 if (keyidx != -1) { if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) { - fprintf(stdout, "%s: tokenizer model not supported!\n", __func__); + printf("%s: tokenizer model not supported!\n", __func__); return false; } } else { - fprintf(stdout, "%s: tokenizer model not found!\n", __func__); + printf("%s: tokenizer model not found!\n", __func__); return false; } @@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens"); if (tokens_keyidx == -1) { - fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__); + printf("%s: gpt2 tokenizer vocab not found!\n", __func__); return false; } int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges"); if (merges_keyidx == -1) { - fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__); + printf("%s: gpt2 tokenizer merges not found!\n", __func__); return false; } hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx); hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx); - fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); - fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); + printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab); + printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges); for (size_t i = 0; i < hparams.n_vocab; i++) { std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i); @@ 
-524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); } - if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } - if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } - if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } - if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } - if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } - if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); } + if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); } + if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); } + if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); } + if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); } + if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); } + if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); } } @@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2 { const int n_tensors = gguf_get_n_tensors(ggufctx); - fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ggufctx, i); const size_t offset = gguf_get_tensor_offset(ggufctx, i); - fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } #endif diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index bf3a487abd305..72a025077b359 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = { }; static void print_usage(int /* argc */, char ** argv) { - fprintf(stdout, "usage: %s [options]\n", argv[0]); - fprintf(stdout, "\n"); - fprintf(stdout, "options:\n"); - fprintf(stdout, " -h, --help\n"); - fprintf(stdout, " -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - fprintf(stdout, " -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - 
fprintf(stdout, " -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - fprintf(stdout, " -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); - fprintf(stdout, " -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - fprintf(stdout, " -ngl N, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - fprintf(stdout, " -mg i, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); - fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - fprintf(stdout, " -ts, --tensor_split \n"); - fprintf(stdout, " -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - fprintf(stdout, " -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); - fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); - fprintf(stdout, "\n"); - fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help\n"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl N, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -mg i, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str()); + printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); + printf(" -ts, --tensor_split \n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? 
"1" : "0"); + printf("\n"); + printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 94def943b9a0a..6b606447da749 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line, } const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); + printf("%.*s\n", (int)str.size(), str.data()); fflush(stdout); } @@ -694,50 +694,50 @@ struct llama_server_context static void server_print_usage(const char *argv0, const gpt_params ¶ms, const server_params &sparams) { - fprintf(stdout, "usage: %s [options]\n", argv0); - fprintf(stdout, "\n"); - fprintf(stdout, "options:\n"); - fprintf(stdout, " -h, --help show this help message and exit\n"); - fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); + printf("usage: %s [options]\n", argv0); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { - fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa attempt optimizations that help on some NUMA systems\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); - fprintf(stdout, " number of layers to store in VRAM\n"); - fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); - fprintf(stdout, " -nommq, --no-mul-mat-q\n"); - fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); - fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ts SPLIT --tensor-split SPLIT\n"); + printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n"); + printf(" -nommq, --no-mul-mat-q\n"); + printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif - fprintf(stdout, " -m FNAME, --model FNAME\n"); - fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); - fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); - fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port); - fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); - fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - fprintf(stdout, "\n"); + printf(" -m FNAME, --model FNAME\n"); + printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -a ALIAS, --alias ALIAS\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); + printf("\n"); } static void server_params_parse(int argc, char **argv, server_params &sparams, @@ -1595,7 +1595,7 @@ int main(int argc, char **argv) svr.set_base_dir(sparams.public_path); // to make it ctrl+clickable: - fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); LOG_INFO("HTTP server listening", { {"hostname", sparams.hostname}, From 9e2023156e5b5acabaf8632e66c6ae68d3703c31 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 5 Sep 2023 15:12:00 -0400 Subject: [PATCH 3/6] make : use new flag variables for recent changes (#3019) --- Makefile | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 139fa02a8fd4b..fe7ddc9ef002f 100644 --- a/Makefile +++ b/Makefile @@ -109,12 +109,11 @@ endif ifdef LLAMA_CODE_COVERAGE - CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' + MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' endif ifdef LLAMA_DISABLE_LOGS - CFLAGS += -DLOG_DISABLE_LOGS - CXXFLAGS += -DLOG_DISABLE_LOGS + MK_CPPFLAGS += -DLOG_DISABLE_LOGS endif # LLAMA_DISABLE_LOGS # warnings @@ -124,7 +123,7 @@ MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-m ifeq '' '$(findstring clang++,$(CXX))' # g++ only - CXXFLAGS += -Wno-format-truncation + MK_CXXFLAGS += -Wno-format-truncation endif # OS specific @@ -188,8 +187,8 @@ endif # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 # https://github.com/ggerganov/llama.cpp/issues/2922 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' - CFLAGS += -Xassembler -muse-unaligned-vector-move - CXXFLAGS += -Xassembler -muse-unaligned-vector-move + MK_CFLAGS += -Xassembler -muse-unaligned-vector-move + MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move endif ifneq ($(filter aarch64%,$(UNAME_M)),) @@ -226,8 +225,8 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif else - CFLAGS += -march=rv64gcv -mabi=lp64d - CXXFLAGS += -march=rv64gcv -mabi=lp64d + MK_CFLAGS += -march=rv64gcv -mabi=lp64d + MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif ifndef LLAMA_NO_K_QUANTS @@ -247,16 +246,6 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE -ifdef LLAMA_METAL - # By default - use GPU acceleration on Mac OS - ifeq ($(UNAME_S),Darwin) - CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG - CXXFLAGS += -DGGML_USE_METAL - LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJS += ggml-metal.o - endif -endif # LLAMA_METAL - ifdef LLAMA_MPI MK_CPPFLAGS += -DGGML_USE_MPI MK_CFLAGS += -Wno-cast-qual @@ -368,7 +357,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif # LLAMA_HIPBLAS ifdef LLAMA_METAL - MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG + MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o endif # LLAMA_METAL From 9912b9efc8922321fe7202ab42ba913833cbe9cd Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 5 Sep 2023 18:21:10 -0400 Subject: [PATCH 4/6] build : add LLAMA_METAL_NDEBUG flag (#3033) --- CMakeLists.txt | 5 ++++- Makefile | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e872ae310761b..d4ed6179ea7a7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) 
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) +option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -174,7 +175,9 @@ if (LLAMA_METAL) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - #add_compile_definitions(GGML_METAL_NDEBUG) + if (LLAMA_METAL_NDEBUG) + add_compile_definitions(GGML_METAL_NDEBUG) + endif() # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/Makefile b/Makefile index fe7ddc9ef002f..4334761a44d27 100644 --- a/Makefile +++ b/Makefile @@ -360,6 +360,9 @@ ifdef LLAMA_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o +ifdef LLAMA_METAL_NDEBUG + MK_CPPFLAGS += -DGGML_METAL_NDEBUG +endif endif # LLAMA_METAL ifdef LLAMA_METAL From ea2c85d5d2a93d39d0172222917f3195f0e456ff Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Wed, 6 Sep 2023 02:49:11 -0600 Subject: [PATCH 5/6] convert-llama-ggml-to-gguf: Try to handle files older than GGJTv3 (#3023) * convert-llama-ggmlv3-to-gguf: Try to handle files older than GGJTv3 * Better error messages for files that cannot be converted * Add file type to GGUF output * Rename to convert-llama-ggml-to-gguf.py * Include original file type information in description * Improve some informational output --- ...o-gguf.py => convert-llama-ggml-to-gguf.py | 168 ++++++++++++++---- 1 file changed, 133 insertions(+), 35 deletions(-) rename convert-llama-ggmlv3-to-gguf.py => convert-llama-ggml-to-gguf.py (68%) diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggml-to-gguf.py similarity index 68% rename from convert-llama-ggmlv3-to-gguf.py rename to convert-llama-ggml-to-gguf.py index 08ba0c490cd1e..b5d3e0b3c3ace 100755 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -5,6 +5,7 @@ import math import struct import sys +from enum import IntEnum from pathlib import Path import numpy as np @@ -34,10 +35,35 @@ gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8), } +class GGMLFormat(IntEnum): + GGML = 0 + GGMF = 1 + GGJT = 2 + +class GGMLFType(IntEnum): + ALL_F32 = 0 + MOSTLY_F16 = 1 + MOSTLY_Q4_0 = 2 + MOSTLY_Q4_1 = 3 + MOSTLY_Q4_1_SOME_F16 = 4 + MOSTLY_Q8_0 = 7 + MOSTLY_Q5_0 = 8 + MOSTLY_Q5_1 = 9 + MOSTLY_Q2_K = 10 + MOSTLY_Q3_K_S = 11 + MOSTLY_Q3_K_M = 12 + MOSTLY_Q3_K_L = 13 + MOSTLY_Q4_K_S = 14 + MOSTLY_Q4_K_M = 15 + MOSTLY_Q5_K_S = 16 + MOSTLY_Q5_K_M = 17 + MOSTLY_Q6_K = 18 + class Hyperparameters: def __init__(self): - self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0 - self.n_ff = 0 + self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 + self.n_layer = self.n_rot = self.n_ff = 0 + self.ftype = GGMLFType.ALL_F32 def set_n_ff(self, model): ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight') @@ -53,16 +79,21 @@ def load(self, data, offset): self.n_head, self.n_layer, self.n_rot, - self.ftype, + ftype, ) = struct.unpack('<7I', data[offset:offset + (4 * 7)]) + try: + self.ftype = GGMLFType(ftype) + except ValueError: + raise ValueError(f'Invalid ftype {ftype}') return 4 * 7 def __str__(self): - return f'' + return f'' class Vocab: - def __init__(self): + def __init__(self, 
load_scores = True): self.items = [] + self.load_scores = load_scores def load(self, data, offset, n_vocab): orig_offset = offset @@ -70,20 +101,24 @@ def load(self, data, offset, n_vocab): itemlen = struct.unpack(' 3: + raise ValueError(f'Cannot handle unexpected GGJT file version {version}') + self.file_format = GGMLFormat.GGJT + self.format_version = version + return 8 + raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.") + + def validate_conversion(self, ftype): + err = '' + if (self.file_format < GGMLFormat.GGJT or self.format_version < 2): + if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): + err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' + elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): + if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, + GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): + err = 'Q4 and Q8 quantizations changed in GGJTv3.' + if len(err) > 0: + raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') def load(self, data, offset): offset += self.validate_header(data, offset) hp = Hyperparameters() offset += hp.load(data, offset) - vocab = Vocab() + print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') + self.validate_conversion(hp.ftype) + vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) offset += vocab.load(data, offset, hp.n_vocab) tensors: list[Tensor] = [] tensor_map = {} while offset < len(data): - tensor = Tensor() + tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF) offset += tensor.load(data, offset) tensor_map[tensor.name] = len(tensors) tensors.append(tensor) @@ -168,7 +235,10 @@ def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override def save(self): print('* Preparing to save GGUF file') - gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False) + gguf_writer = gguf.GGUFWriter( + self.cfg.output, + gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], + use_temp_file = False ) self.add_params(gguf_writer) self.add_vocab(gguf_writer) if self.special_vocab is not None: @@ -185,7 +255,10 @@ def save(self): def add_params(self, gguf_writer): hp = self.model.hyperparameters cfg = self.cfg - desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format' + if cfg.desc is not None: + desc = cfg.desc + else: + desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format' try: # Filenames aren't necessarily valid UTF8. 
name = cfg.name if cfg.name is not None else cfg.input.name @@ -195,6 +268,7 @@ def add_params(self, gguf_writer): if name is not None: gguf_writer.add_name(name) gguf_writer.add_description(desc) + gguf_writer.add_file_type(int(hp.ftype)) if self.params_override is not None: po = self.params_override assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch' @@ -231,7 +305,8 @@ def add_vocab(self, gguf_writer): tokens.append(vbytes) scores.append(score) toktypes.append(ttype) - assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' + assert len(tokens) == hp.n_vocab, \ + f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) if len(toktypes) > 0: @@ -283,7 +358,11 @@ def add_tensors(self, gguf_writer): tempdims[1] = tempdims[0] tempdims[0] = temp # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}') - gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype) + gguf_writer.add_tensor( + mapped_name, + data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], + raw_shape = tempdims, + raw_dtype = tensor.dtype ) def handle_metadata(cfg, hp): import convert @@ -305,32 +384,46 @@ def handle_metadata(cfg, hp): params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) else: raise ValueError('Unable to load metadata') - vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype) + vocab = convert.load_vocab( + cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, + cfg.vocabtype ) # FIXME: Respect cfg.vocab_dir? 
svocab = gguf.SpecialVocab(cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) return (params, vocab, svocab) def handle_args(): - parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF') - parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename') - parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename') - parser.add_argument('--name', help = 'Set model name') - parser.add_argument('--desc', help = 'Set model description') - parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') - parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') - parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') - parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory') - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm") + parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') + parser.add_argument('--input', '-i', type = Path, required = True, + help = 'Input GGMLv3 filename') + parser.add_argument('--output', '-o', type = Path, required = True, + help ='Output GGUF filename') + parser.add_argument('--name', + help = 'Set model name') + parser.add_argument('--desc', + help = 'Set model description') + parser.add_argument('--gqa', type = int, default = 1, + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + parser.add_argument('--eps', default = '5.0e-06', + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + parser.add_argument('--context-length', '-c', type=int, default = 2048, + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + parser.add_argument('--model-metadata-dir', '-m', type = Path, + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + parser.add_argument("--vocab-dir", type=Path, + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") return parser.parse_args() def main(): cfg = handle_args() print(f'* Using config: {cfg}') print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') + if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): + print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') - model = GGMLV3Model() + model = GGMLModel() print('* Scanning GGML input file') offset = model.load(data, 0) print(f'* GGML model hyperparameters: {model.hyperparameters}') @@ -345,7 +438,12 @@ def main(): print(f'* Special vocab: {special_vocab}') else: print('\n=== WARNING === Special tokens may not be converted correctly. 
Use --model-metadata-dir if possible === WARNING ===\n') - converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab) + if model.file_format == GGMLFormat.GGML: + print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') + converter = GGMLToGGUF(model, data, cfg, + params_override = params_override, + vocab_override = vocab_override, + special_vocab = special_vocab ) converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') From 178b1850ebd21b349cebbee887950e435c5aa2d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 6 Sep 2023 12:40:57 +0300 Subject: [PATCH 6/6] k-quants : fix zero-weight guard in Q6_K (ref #3040) --- k_quants.c | 1 + 1 file changed, 1 insertion(+) diff --git a/k_quants.c b/k_quants.c index 8742d4aee6f54..eb702ce86acd9 100644 --- a/k_quants.c +++ b/k_quants.c @@ -1089,6 +1089,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict if (!max_abs_scale) { memset(&y[i], 0, sizeof(block_q6_K)); y[i].d = ggml_fp32_to_fp16(0.f); + x += QK_K; continue; }
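A note on the hunk above (patch 6/6): the zero-weight guard in quantize_row_q6_K_reference zeroes the output block and skips to the next super-block whenever every weight in the current super-block is zero, but the early continue bypassed the advance of the source pointer that the rest of the loop body performs for each super-block, so every following super-block would have been read from misaligned input. The added x += QK_K; keeps the input pointer in step with the output blocks. Below is a minimal sketch of that loop shape, using hypothetical names and a simplified block type rather than the real block_q6_K:

    /* Sketch only: hypothetical names and a simplified block type, not the
     * real block_q6_K or quantize_row_q6_K_reference from k_quants.c. It
     * shows why the early-exit path for an all-zero super-block must still
     * advance the source pointer x. */
    #include <string.h>

    #define QK_K 256

    typedef struct {
        float         d;            /* block scale                  */
        unsigned char q[QK_K / 2];  /* packed quants (simplified)   */
    } block_sketch;

    static void quantize_rows_sketch(const float * x, block_sketch * y, int k) {
        const int nb = k / QK_K;                 /* number of super-blocks    */
        for (int i = 0; i < nb; i++) {
            float max_abs = 0.0f;
            for (int j = 0; j < QK_K; j++) {
                const float a = x[j] < 0.0f ? -x[j] : x[j];
                if (a > max_abs) max_abs = a;
            }
            if (max_abs == 0.0f) {               /* all weights are zero      */
                memset(&y[i], 0, sizeof(y[i]));
                x += QK_K;                       /* the fix: keep x in step   */
                continue;                        /* with y before skipping    */
            }
            /* ... quantize this super-block into y[i] here ... */
            x += QK_K;                           /* normal per-block advance  */
        }
    }

Only the single x += QK_K; line inside the guard is new in the patch; the memset, the zeroed scale, and the continue are unchanged context lines in the hunk.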