From b63ad9a5d28fd3d1e8003ffa48b157000716538b Mon Sep 17 00:00:00 2001
From: caitianchi <caitianchi@modelbest.cn>
Date: Thu, 8 Aug 2024 11:30:58 +0800
Subject: [PATCH] minicpmv-cli: disable FFmpeg-based video input

---
 Makefile                        |  20 +-
 examples/llava/minicpmv-cli.cpp | 334 ++++++++++++++++----------------
 2 files changed, 180 insertions(+), 174 deletions(-)
diff --git a/Makefile b/Makefile
index a157b1e1ceef8e..e70564603b1ff9 100644
--- a/Makefile
+++ b/Makefile
@@ -950,15 +950,21 @@ llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/lla
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
 
-FFMPEG_CFLAGS := $(shell pkg-config --cflags libavformat libavcodec libavutil)
-FFMPEG_LIBS := $(shell pkg-config --libs libavformat libavcodec libavutil) -lswscale
+# FFMPEG_CFLAGS := $(shell pkg-config --cflags libavformat libavcodec libavutil)
+# FFMPEG_LIBS := $(shell pkg-config --libs libavformat libavcodec libavutil) -lswscale
+
+# llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+# 	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+# 	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
+# 	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
+# 	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) $(FFMPEG_LIBS)
 
 llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
-	$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) $(FFMPEG_LIBS)
-	
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
+
 llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 6f46c2a1419b52..809e5c9ecd23c6 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -9,12 +9,12 @@
 #include <cstdlib>
 #include <vector>
 
-extern "C" {
-    #include <libavcodec/avcodec.h>
-    #include <libavformat/avformat.h>
-    #include <libavutil/imgutils.h>
-    #include <libswscale/swscale.h>
-}
+// extern "C" {
+//     #include <libavcodec/avcodec.h>
+//     #include <libavformat/avformat.h>
+//     #include <libavutil/imgutils.h>
+//     #include <libswscale/swscale.h>
+// }
 
 struct llava_context {
     struct clip_ctx * ctx_clip = NULL;
@@ -28,133 +28,133 @@ struct clip_image_u8 {
     std::vector<uint8_t> buf;
 };
 
-static std::vector<clip_image_u8 *> extract_frames(const std::string& video_path) {
-    AVFormatContext* format_ctx = nullptr;
-    if (avformat_open_input(&format_ctx, video_path.c_str(), nullptr, nullptr) < 0) {
-        LOG_TEE("Could not open video file.");
-        return {};
-    }
-
-    if (avformat_find_stream_info(format_ctx, nullptr) < 0) {
-        LOG_TEE("Could not find stream information.");
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    const AVCodec* codec = nullptr;
-    AVCodecContext* codec_ctx = nullptr;
-    int video_stream_index = -1;
-
-    // Find the video stream
-    for (size_t i = 0; i < format_ctx->nb_streams; ++i) {
-        if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
-            codec = avcodec_find_decoder(format_ctx->streams[i]->codecpar->codec_id);
-            if (codec) {
-                video_stream_index = i;
-                break;
-            }
-        }
-    }
-
-    if (video_stream_index == -1) {
-        LOG_TEE("Could not find video stream.");
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    codec_ctx = avcodec_alloc_context3(codec);
-    if (!codec_ctx) {
-        LOG_TEE("Could not allocate video codec context.");
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    if (avcodec_parameters_to_context(codec_ctx, format_ctx->streams[video_stream_index]->codecpar) < 0) {
-        LOG_TEE("Could not copy codec parameters to codec context.");
-        avcodec_free_context(&codec_ctx);
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
-        LOG_TEE("Could not open codec.");
-        avcodec_free_context(&codec_ctx);
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    AVFrame* frame = av_frame_alloc();
-    AVFrame* frame_rgb = av_frame_alloc();
-    if (!frame || !frame_rgb) {
-        LOG_TEE("Could not allocate frames.");
-        av_frame_free(&frame);
-        av_frame_free(&frame_rgb);
-        avcodec_free_context(&codec_ctx);
-        avformat_close_input(&format_ctx);
-        return {};
-    }
-
-    int num_bytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
-    uint8_t* buffer = (uint8_t*)av_malloc(num_bytes * sizeof(uint8_t));
-    av_image_fill_arrays(frame_rgb->data, frame_rgb->linesize, buffer, AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
-
-    struct SwsContext* sws_ctx = sws_getContext(codec_ctx->width, codec_ctx->height, codec_ctx->pix_fmt,
-                                                codec_ctx->width, codec_ctx->height, AV_PIX_FMT_RGB24,
-                                                SWS_BILINEAR, nullptr, nullptr, nullptr);
-
-    std::vector<clip_image_u8 *> frames;
-
-    AVPacket packet;
-    int64_t last_pts = AV_NOPTS_VALUE;
-    int64_t total_frames = format_ctx->streams[video_stream_index]->nb_frames;
-    // LOG_TEE("total_frames: %lld\n", total_frames);
-
-    int64_t frame_interval = (int64_t)codec_ctx->framerate.num / codec_ctx->framerate.den;
-    // LOG_TEE("frame_interval: %lld\n", frame_interval);
-    // LOG_TEE("codec_ctx->framerate.num: %lld\n", codec_ctx->framerate.num);
-    // LOG_TEE("codec_ctx->framerate.den: %lld\n", codec_ctx->framerate.den);
-
-    float frame_len = 1.0 * total_frames / frame_interval;
-    LOG_TEE("frame_len: %f\n", frame_len);
-    if(frame_len > 15){
-        frame_interval = (int64_t)(1.0 * total_frames / 15);
-    }
-    // LOG_TEE("frame_interval: %lld\n", frame_interval);
-    int frame_idx = 0;
-    while (av_read_frame(format_ctx, &packet) >= 0) {
-        if (packet.stream_index == video_stream_index) {
-            if (avcodec_send_packet(codec_ctx, &packet) == 0) {
-                for(;avcodec_receive_frame(codec_ctx, frame) == 0;frame_idx++) {
-                    // int frame_idx = frame->pts/codec_ctx->framerate.den;
-                    // LOG_TEE("frame_idx: %d %d\n", frame_idx, frame_idx % frame_interval);
-                    if (frame->pts != last_pts && (frame_idx) % frame_interval == 0) {
-                        sws_scale(sws_ctx, frame->data, frame->linesize, 0, codec_ctx->height,
-                                  frame_rgb->data, frame_rgb->linesize);
-
-                        clip_image_u8 * img = clip_image_u8_init();
-                        img->nx = codec_ctx->width;
-                        img->ny = codec_ctx->height;
-                        img->buf.resize(num_bytes);
-                        std::copy(buffer, buffer + num_bytes, img->buf.begin());
-
-                        frames.push_back(img);
-                        last_pts = frame->pts;
-                    }
-                }
-            }
-        }
-        av_packet_unref(&packet);
-    }
-
-    av_free(buffer);
-    av_frame_free(&frame_rgb);
-    av_frame_free(&frame);
-    avcodec_free_context(&codec_ctx);
-    avformat_close_input(&format_ctx);
-    sws_freeContext(sws_ctx);
-
-    return frames;
-}
+// static std::vector<clip_image_u8 *> extract_frames(const std::string& video_path) {
+//     AVFormatContext* format_ctx = nullptr;
+//     if (avformat_open_input(&format_ctx, video_path.c_str(), nullptr, nullptr) < 0) {
+//         LOG_TEE("Could not open video file.");
+//         return {};
+//     }
+
+//     if (avformat_find_stream_info(format_ctx, nullptr) < 0) {
+//         LOG_TEE("Could not find stream information.");
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     const AVCodec* codec = nullptr;
+//     AVCodecContext* codec_ctx = nullptr;
+//     int video_stream_index = -1;
+
+//     // Find the video stream
+//     for (size_t i = 0; i < format_ctx->nb_streams; ++i) {
+//         if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
+//             codec = avcodec_find_decoder(format_ctx->streams[i]->codecpar->codec_id);
+//             if (codec) {
+//                 video_stream_index = i;
+//                 break;
+//             }
+//         }
+//     }
+
+//     if (video_stream_index == -1) {
+//         LOG_TEE("Could not find video stream.");
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     codec_ctx = avcodec_alloc_context3(codec);
+//     if (!codec_ctx) {
+//         LOG_TEE("Could not allocate video codec context.");
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     if (avcodec_parameters_to_context(codec_ctx, format_ctx->streams[video_stream_index]->codecpar) < 0) {
+//         LOG_TEE("Could not copy codec parameters to codec context.");
+//         avcodec_free_context(&codec_ctx);
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
+//         LOG_TEE("Could not open codec.");
+//         avcodec_free_context(&codec_ctx);
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     AVFrame* frame = av_frame_alloc();
+//     AVFrame* frame_rgb = av_frame_alloc();
+//     if (!frame || !frame_rgb) {
+//         LOG_TEE("Could not allocate frames.");
+//         av_frame_free(&frame);
+//         av_frame_free(&frame_rgb);
+//         avcodec_free_context(&codec_ctx);
+//         avformat_close_input(&format_ctx);
+//         return {};
+//     }
+
+//     int num_bytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
+//     uint8_t* buffer = (uint8_t*)av_malloc(num_bytes * sizeof(uint8_t));
+//     av_image_fill_arrays(frame_rgb->data, frame_rgb->linesize, buffer, AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
+
+//     struct SwsContext* sws_ctx = sws_getContext(codec_ctx->width, codec_ctx->height, codec_ctx->pix_fmt,
+//                                                 codec_ctx->width, codec_ctx->height, AV_PIX_FMT_RGB24,
+//                                                 SWS_BILINEAR, nullptr, nullptr, nullptr);
+
+//     std::vector<clip_image_u8 *> frames;
+
+//     AVPacket packet;
+//     int64_t last_pts = AV_NOPTS_VALUE;
+//     int64_t total_frames = format_ctx->streams[video_stream_index]->nb_frames;
+//     // LOG_TEE("total_frames: %lld\n", total_frames);
+
+//     int64_t frame_interval = (int64_t)codec_ctx->framerate.num / codec_ctx->framerate.den;
+//     // LOG_TEE("frame_interval: %lld\n", frame_interval);
+//     // LOG_TEE("codec_ctx->framerate.num: %lld\n", codec_ctx->framerate.num);
+//     // LOG_TEE("codec_ctx->framerate.den: %lld\n", codec_ctx->framerate.den);
+
+//     float frame_len = 1.0 * total_frames / frame_interval;
+//     LOG_TEE("frame_len: %f\n", frame_len);
+//     if(frame_len > 15){
+//         frame_interval = (int64_t)(1.0 * total_frames / 15);
+//     }
+//     // LOG_TEE("frame_interval: %lld\n", frame_interval);
+//     int frame_idx = 0;
+//     while (av_read_frame(format_ctx, &packet) >= 0) {
+//         if (packet.stream_index == video_stream_index) {
+//             if (avcodec_send_packet(codec_ctx, &packet) == 0) {
+//                 for(;avcodec_receive_frame(codec_ctx, frame) == 0;frame_idx++) {
+//                     // int frame_idx = frame->pts/codec_ctx->framerate.den;
+//                     // LOG_TEE("frame_idx: %d %d\n", frame_idx, frame_idx % frame_interval);
+//                     if (frame->pts != last_pts && (frame_idx) % frame_interval == 0) {
+//                         sws_scale(sws_ctx, frame->data, frame->linesize, 0, codec_ctx->height,
+//                                   frame_rgb->data, frame_rgb->linesize);
+
+//                         clip_image_u8 * img = clip_image_u8_init();
+//                         img->nx = codec_ctx->width;
+//                         img->ny = codec_ctx->height;
+//                         img->buf.resize(num_bytes);
+//                         std::copy(buffer, buffer + num_bytes, img->buf.begin());
+
+//                         frames.push_back(img);
+//                         last_pts = frame->pts;
+//                     }
+//                 }
+//             }
+//         }
+//         av_packet_unref(&packet);
+//     }
+
+//     av_free(buffer);
+//     av_frame_free(&frame_rgb);
+//     av_frame_free(&frame);
+//     avcodec_free_context(&codec_ctx);
+//     avformat_close_input(&format_ctx);
+//     sws_freeContext(sws_ctx);
+
+//     return frames;
+// }
 
 static void show_additional_info(int /*argc*/, char ** argv) {
     LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> [--video <path/to/an/video.mp4>] [--image <path/to/an/image.jpg>] [--image <path/to/another/image.jpg>] [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
@@ -471,39 +471,39 @@ int main(int argc, char ** argv) {
     int n_past = 0;
     struct llava_context * ctx_llava = nullptr;
 
-    if (params.video.size() > 0){
-        ctx_llava = llava_init_context(&params);
-        auto video = params.video;        
-        std::vector<clip_image_u8 *> frames = extract_frames(video.c_str());
-        process_prompt(0, ctx_llava, &params, n_past);
-        // LOG_TEE("frames.size: %zu\n", frames.size());
-        for (size_t i = 0; i < frames.size(); ++i) {
-            auto embeds = video_image_embed(ctx_llava->ctx_clip, &params, frames[i]);
-            process_input(ctx_llava, &params, 1, "", n_past, embeds);
-        }
-        process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past);
-        process_prompt(2, ctx_llava, &params, n_past);
-
-        struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
-        const int max_tgt_len = params.n_predict < 0 ? 8192 : params.n_predict;
-        std::string response = "";
-        bool have_tmp = false;
-        for (int i = 0; i < max_tgt_len; i++) {
-            auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
-            response += tmp;
-            if (strcmp(tmp, "</s>") == 0){
-                if(!have_tmp)continue;
-                else break;
-            }
-            have_tmp = true;
-            printf("%s", tmp);
-            if (strstr(response.c_str(), "<user>")) break; // minicpm-v 
-
-            fflush(stdout);
-        }
-        llama_sampling_free(ctx_sampling);
-    }
-    else {
+    // if (params.video.size() > 0){
+    //     ctx_llava = llava_init_context(&params);
+    //     auto video = params.video;        
+    //     std::vector<clip_image_u8 *> frames = extract_frames(video.c_str());
+    //     process_prompt(0, ctx_llava, &params, n_past);
+    //     // LOG_TEE("frames.size: %zu\n", frames.size());
+    //     for (size_t i = 0; i < frames.size(); ++i) {
+    //         auto embeds = video_image_embed(ctx_llava->ctx_clip, &params, frames[i]);
+    //         process_input(ctx_llava, &params, 1, "", n_past, embeds);
+    //     }
+    //     process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past);
+    //     process_prompt(2, ctx_llava, &params, n_past);
+
+    //     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    //     const int max_tgt_len = params.n_predict < 0 ? 8192 : params.n_predict;
+    //     std::string response = "";
+    //     bool have_tmp = false;
+    //     for (int i = 0; i < max_tgt_len; i++) {
+    //         auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+    //         response += tmp;
+    //         if (strcmp(tmp, "</s>") == 0){
+    //             if(!have_tmp)continue;
+    //             else break;
+    //         }
+    //         have_tmp = true;
+    //         printf("%s", tmp);
+    //         if (strstr(response.c_str(), "<user>")) break; // minicpm-v 
+
+    //         fflush(stdout);
+    //     }
+    //     llama_sampling_free(ctx_sampling);
+    // }
+    // else {
         if (params.image.size() > 1) {
             ctx_llava = llava_init_context(&params);
             process_prompt(0, ctx_llava, &params, n_past);
@@ -585,7 +585,7 @@ int main(int argc, char ** argv) {
 
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
-    }
+    // }
 
     return 0;
 }
\ No newline at end of file