Merge branch 'concedo_experimental' into frankenstein_3
Nexesenex committed May 1, 2024
2 parents b686666 + b48ea96 commit e9d5cec
Showing 49 changed files with 4,764 additions and 1,079 deletions.
22 changes: 3 additions & 19 deletions README.md
@@ -11,10 +11,10 @@ KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models
- **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
- Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. You can also rebuild it yourself with the provided makefiles and scripts.
- Weights are not included; you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke)).
- To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
- To run, simply execute **koboldcpp.exe**.
- Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you don't have to change much besides the `Presets` and `GPU Layers`. Read the `--help` for more info about each setting.
- By default, you can connect to http://localhost:5001
- You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help`
- You can also run it using the command line (see the example below). For more info, please check `koboldcpp.exe --help`
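
For illustration, a hypothetical command-line launch (the model filename and port are placeholders; run `koboldcpp.exe --help` for the authoritative list of options):

```sh
# Load a quantized model and serve on port 5001, then connect with
# Kobold or Kobold Lite at http://localhost:5001
koboldcpp.exe ggml_model.bin 5001
```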

### Improving Performance
- **(Nvidia Only) GPU Acceleration**: If you're on Windows with an Nvidia GPU, you can get CUDA support out of the box using the `--usecublas` flag; make sure you select the correct .exe with CUDA support (see the sketch below).
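
A hedged sketch of such a launch (the layer count and model filename are illustrative assumptions; check `--help` for the options your build actually supports):

```sh
# Use the CUDA (cuBLAS) backend and offload 40 layers to the GPU
koboldcpp.exe --usecublas --gpulayers 40 ggml_model.bin
```
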
@@ -64,23 +64,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
- Note: Many OSX users have found that using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.

### Arch Linux Packages
There are 4 community made AUR packages (Maintained by @AlpinDale) available: [CPU-only](https://aur.archlinux.org/packages/koboldcpp-cpu), [CLBlast](https://aur.archlinux.org/packages/koboldcpp-clblast), [CUBLAS](https://aur.archlinux.org/packages/koboldcpp-cuda), and [HIPBLAS](https://aur.archlinux.org/packages/koboldcpp-hipblas). They are, respectively, for users with no GPU, users with a GPU (vendor-agnostic), users with NVIDIA GPUs, and users with a supported AMD GPU.

The recommended installation method is through an AUR helper such as [paru](https://aur.archlinux.org/packages/paru) or [yay](https://aur.archlinux.org/packages/yay):

```sh
paru -S koboldcpp-cpu
```

Alternatively, you can manually install, though it's not recommended (since the build depends on [customtkinter](https://aur.archlinux.org/packages/customtkinter)):

```sh
git clone https://aur.archlinux.org/koboldcpp-cpu.git && cd koboldcpp-cpu

makepkg -si
```

You can then run koboldcpp anywhere from the terminal by running `koboldcpp` to spawn the GUI, or `koboldcpp --help` to view the list of commands for command-line execution (in case the GUI does not work).
There are some community-made AUR packages (maintained by @AlpinDale) available: [CUBLAS](https://aur.archlinux.org/packages/koboldcpp-cuda) and [HIPBLAS](https://aur.archlinux.org/packages/koboldcpp-hipblas). They are intended for users with NVIDIA GPUs and users with a supported AMD GPU, respectively. Note that these packages may be outdated, and it's probably better to use the official KoboldCpp binaries.

## Compiling on Windows
- You're encouraged to use the released .exe, but if you want to compile your binaries from source on Windows, the easiest way is:
283 changes: 153 additions & 130 deletions common/common.cpp

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions common/common.h
@@ -31,6 +31,8 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// build info

struct llama_control_vector_load_info;
@@ -108,7 +110,7 @@ struct gpt_params {
// sampling parameters
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
@@ -149,7 +151,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

bool kl_divergence = false; // compute KL-divergence
bool kl_divergence = false; // compute KL divergence

bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
@@ -164,6 +166,7 @@ struct gpt_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@@ -183,10 +186,12 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V

// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
};

void gpt_params_handle_model_default(gpt_params & params);

bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -212,6 +217,7 @@ bool validate_file_name(const std::string & filename);
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

//
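
Context for the `model` default change above: with `model` now empty by default and `DEFAULT_MODEL_PATH` plus `gpt_params_handle_model_default()` introduced, the fallback can be applied in one place after argument parsing. Below is a minimal standalone sketch of that idea, not the actual implementation; how it interacts with `model_url` and other download options is an assumption:

```cpp
#include <cstdio>
#include <string>

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// Simplified stand-in for the relevant gpt_params fields.
struct params_sketch {
    std::string model;      // local model path, "" = not set by the user
    std::string model_url;  // remote model to download, "" = not set
};

// Assumed behaviour: fall back to the default path only when neither a
// local path nor a download source was given on the command line.
static void handle_model_default_sketch(params_sketch & p) {
    if (p.model.empty() && p.model_url.empty()) {
        p.model = DEFAULT_MODEL_PATH;
    }
}

int main() {
    params_sketch p;                       // nothing specified by the "user"
    handle_model_default_sketch(p);
    std::printf("model path: %s\n", p.model.c_str());
    return 0;
}
```
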
4 changes: 2 additions & 2 deletions common/log.h
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
2 changes: 1 addition & 1 deletion common/sampling.cpp
@@ -68,7 +68,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
    if (seed == LLAMA_DEFAULT_SEED) {
        seed = time(NULL);
        seed = std::random_device{}();
    }
    ctx->rng.seed(seed);
}
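
A small standalone illustration (not part of the commit) of why this seeding change matters: processes seeded within the same second get identical `time(NULL)` seeds and therefore identical sampling streams, whereas `std::random_device` yields independent seeds:

```cpp
#include <cstdio>
#include <ctime>
#include <random>

int main() {
    // Two "default" seeds taken in quick succession: almost certainly equal,
    // so two contexts started together would sample identical streams.
    const unsigned seed_a = (unsigned) time(nullptr);
    const unsigned seed_b = (unsigned) time(nullptr);
    std::printf("time(NULL) seeds:    %u %u\n", seed_a, seed_b);

    // Two draws from std::random_device: independent, non-deterministic.
    const unsigned rd_a = std::random_device{}();
    const unsigned rd_b = std::random_device{}();
    std::printf("random_device seeds: %u %u\n", rd_a, rd_b);

    // Seed the generator the same way the patched function does.
    std::mt19937 rng(rd_a);
    std::printf("first sample:        %u\n", rng());
    return 0;
}
```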

