diff --git a/Makefile b/Makefile
index 10e2a0904ba60..3ceea61967486 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ BUILD_TARGETS = \
 	llama-imatrix \
 	llama-infill \
 	llama-llava-cli \
-	llama-minicpmv-cli\
+	llama-minicpmv-cli \
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -1352,7 +1352,7 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h exampl
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
-
+
 llama-export-lora: examples/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md
index 0dfcd5f71c802..4affc1d0f26ff 100644
--- a/examples/llava/README-minicpmv2.5.md
+++ b/examples/llava/README-minicpmv2.5.md
@@ -96,4 +96,4 @@ Now, you can start chatting:
 ```
 $cd /data/data/com.termux/files/home/bin
 $./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
-```
\ No newline at end of file
+```
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index c66e140606edd..acb3a3464b582 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1863,7 +1863,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
         std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
         LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
-
+
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
         clip_image_u8 * refine_image = clip_image_u8_init();
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
@@ -2558,6 +2558,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
 
-bool clip_is_minicpmv(const struct clip_ctx * ctx) {
+bool clip_is_minicpmv(const struct clip_ctx * ctx) {
     return ctx->has_minicpmv_projector;
-}
\ No newline at end of file
+}
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index d64a11b65d2bd..916d9dc401dc4 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
             int patch_size=14;
             load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny;
+            load_image_size->height = img_res_v.data[i].ny;
             clip_add_load_image_size(ctx_clip, load_image_size);
             const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
             if (!encoded) {
@@ -278,7 +278,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         }
         image_embd_v.clear();
         load_image_size->width = img->nx;
-        load_image_size->height = img->ny;
+        load_image_size->height = img->ny;
         clip_add_load_image_size(ctx_clip, load_image_size);
         LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
     }
@@ -292,7 +292,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
             return false;
         }
-    }
+    }
     else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 6a1ae358dd9ed..f951b57b29158 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -122,7 +122,7 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
     std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
-
+
     auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
     slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
@@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
 }
 
 static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
-
+
     const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
     return tmp;
 }
@@ -272,7 +272,7 @@ int main(int argc, char ** argv) {
                 if (strstr(tmp, "###")) break; // Yi-VL behavior
                 have_tmp = true;
                 printf("%s", tmp);
-                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
 
                 fflush(stdout);
             }
@@ -292,18 +292,18 @@ int main(int argc, char ** argv) {
                     if (strcmp(tmp, "</s>") == 0) break;
                     if (strstr(tmp, "###")) break; // Yi-VL behavior
                     printf("%s", tmp);// mistral llava-1.6
-                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                     fflush(stdout);
                 }
                 llama_sampling_free(ctx_sampling);
             }
         }
         printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama);
 
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
     }
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index 8e18985aceff9..dfe5fbe62cea6 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
-torchvision==0.16.2
\ No newline at end of file
+torchvision==0.17.1