From b63ad9a5d28fd3d1e8003ffa48b157000716538b Mon Sep 17 00:00:00 2001 From: caitianchi Date: Thu, 8 Aug 2024 11:30:58 +0800 Subject: [PATCH] init --- Makefile | 20 +- examples/llava/minicpmv-cli.cpp | 334 ++++++++++++++++---------------- 2 files changed, 180 insertions(+), 174 deletions(-) diff --git a/Makefile b/Makefile index a157b1e1ceef8e..e70564603b1ff9 100644 --- a/Makefile +++ b/Makefile @@ -950,15 +950,21 @@ llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/lla $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) -FFMPEG_CFLAGS := $(shell pkg-config --cflags libavformat libavcodec libavutil) -FFMPEG_LIBS := $(shell pkg-config --libs libavformat libavcodec libavutil) -lswscale +# FFMPEG_CFLAGS := $(shell pkg-config --cflags libavformat libavcodec libavutil) +# FFMPEG_LIBS := $(shell pkg-config --libs libavformat libavcodec libavutil) -lswscale + +# llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +# $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) +# $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual +# $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) +# $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) $(FFMPEG_LIBS) llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp 
examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual - $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) $(FFMPEG_LIBS) - + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 6f46c2a1419b52..809e5c9ecd23c6 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -9,12 +9,12 @@ #include #include -extern "C" { - #include - #include - #include - #include -} +// extern "C" { +// #include +// #include +// #include +// #include +// } struct llava_context { struct clip_ctx * ctx_clip = NULL; @@ -28,133 +28,133 @@ struct clip_image_u8 { std::vector buf; }; -static std::vector extract_frames(const std::string& 
video_path) { - AVFormatContext* format_ctx = nullptr; - if (avformat_open_input(&format_ctx, video_path.c_str(), nullptr, nullptr) < 0) { - LOG_TEE("Could not open video file."); - return {}; - } - - if (avformat_find_stream_info(format_ctx, nullptr) < 0) { - LOG_TEE("Could not find stream information."); - avformat_close_input(&format_ctx); - return {}; - } - - const AVCodec* codec = nullptr; - AVCodecContext* codec_ctx = nullptr; - int video_stream_index = -1; - - // Find the video stream - for (size_t i = 0; i < format_ctx->nb_streams; ++i) { - if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { - codec = avcodec_find_decoder(format_ctx->streams[i]->codecpar->codec_id); - if (codec) { - video_stream_index = i; - break; - } - } - } - - if (video_stream_index == -1) { - LOG_TEE("Could not find video stream."); - avformat_close_input(&format_ctx); - return {}; - } - - codec_ctx = avcodec_alloc_context3(codec); - if (!codec_ctx) { - LOG_TEE("Could not allocate video codec context."); - avformat_close_input(&format_ctx); - return {}; - } - - if (avcodec_parameters_to_context(codec_ctx, format_ctx->streams[video_stream_index]->codecpar) < 0) { - LOG_TEE("Could not copy codec parameters to codec context."); - avcodec_free_context(&codec_ctx); - avformat_close_input(&format_ctx); - return {}; - } - - if (avcodec_open2(codec_ctx, codec, nullptr) < 0) { - LOG_TEE("Could not open codec."); - avcodec_free_context(&codec_ctx); - avformat_close_input(&format_ctx); - return {}; - } - - AVFrame* frame = av_frame_alloc(); - AVFrame* frame_rgb = av_frame_alloc(); - if (!frame || !frame_rgb) { - LOG_TEE("Could not allocate frames."); - av_frame_free(&frame); - av_frame_free(&frame_rgb); - avcodec_free_context(&codec_ctx); - avformat_close_input(&format_ctx); - return {}; - } - - int num_bytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1); - uint8_t* buffer = (uint8_t*)av_malloc(num_bytes * sizeof(uint8_t)); - 
av_image_fill_arrays(frame_rgb->data, frame_rgb->linesize, buffer, AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1); - - struct SwsContext* sws_ctx = sws_getContext(codec_ctx->width, codec_ctx->height, codec_ctx->pix_fmt, - codec_ctx->width, codec_ctx->height, AV_PIX_FMT_RGB24, - SWS_BILINEAR, nullptr, nullptr, nullptr); - - std::vector frames; - - AVPacket packet; - int64_t last_pts = AV_NOPTS_VALUE; - int64_t total_frames = format_ctx->streams[video_stream_index]->nb_frames; - // LOG_TEE("total_frames: %lld\n", total_frames); - - int64_t frame_interval = (int64_t)codec_ctx->framerate.num / codec_ctx->framerate.den; - // LOG_TEE("frame_interval: %lld\n", frame_interval); - // LOG_TEE("codec_ctx->framerate.num: %lld\n", codec_ctx->framerate.num); - // LOG_TEE("codec_ctx->framerate.den: %lld\n", codec_ctx->framerate.den); - - float frame_len = 1.0 * total_frames / frame_interval; - LOG_TEE("frame_len: %f\n", frame_len); - if(frame_len > 15){ - frame_interval = (int64_t)(1.0 * total_frames / 15); - } - // LOG_TEE("frame_interval: %lld\n", frame_interval); - int frame_idx = 0; - while (av_read_frame(format_ctx, &packet) >= 0) { - if (packet.stream_index == video_stream_index) { - if (avcodec_send_packet(codec_ctx, &packet) == 0) { - for(;avcodec_receive_frame(codec_ctx, frame) == 0;frame_idx++) { - // int frame_idx = frame->pts/codec_ctx->framerate.den; - // LOG_TEE("frame_idx: %d %d\n", frame_idx, frame_idx % frame_interval); - if (frame->pts != last_pts && (frame_idx) % frame_interval == 0) { - sws_scale(sws_ctx, frame->data, frame->linesize, 0, codec_ctx->height, - frame_rgb->data, frame_rgb->linesize); - - clip_image_u8 * img = clip_image_u8_init(); - img->nx = codec_ctx->width; - img->ny = codec_ctx->height; - img->buf.resize(num_bytes); - std::copy(buffer, buffer + num_bytes, img->buf.begin()); - - frames.push_back(img); - last_pts = frame->pts; - } - } - } - } - av_packet_unref(&packet); - } - - av_free(buffer); - av_frame_free(&frame_rgb); - 
av_frame_free(&frame); - avcodec_free_context(&codec_ctx); - avformat_close_input(&format_ctx); - sws_freeContext(sws_ctx); - - return frames; -} +// static std::vector extract_frames(const std::string& video_path) { +// AVFormatContext* format_ctx = nullptr; +// if (avformat_open_input(&format_ctx, video_path.c_str(), nullptr, nullptr) < 0) { +// LOG_TEE("Could not open video file."); +// return {}; +// } + +// if (avformat_find_stream_info(format_ctx, nullptr) < 0) { +// LOG_TEE("Could not find stream information."); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// const AVCodec* codec = nullptr; +// AVCodecContext* codec_ctx = nullptr; +// int video_stream_index = -1; + +// // Find the video stream +// for (size_t i = 0; i < format_ctx->nb_streams; ++i) { +// if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { +// codec = avcodec_find_decoder(format_ctx->streams[i]->codecpar->codec_id); +// if (codec) { +// video_stream_index = i; +// break; +// } +// } +// } + +// if (video_stream_index == -1) { +// LOG_TEE("Could not find video stream."); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// codec_ctx = avcodec_alloc_context3(codec); +// if (!codec_ctx) { +// LOG_TEE("Could not allocate video codec context."); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// if (avcodec_parameters_to_context(codec_ctx, format_ctx->streams[video_stream_index]->codecpar) < 0) { +// LOG_TEE("Could not copy codec parameters to codec context."); +// avcodec_free_context(&codec_ctx); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// if (avcodec_open2(codec_ctx, codec, nullptr) < 0) { +// LOG_TEE("Could not open codec."); +// avcodec_free_context(&codec_ctx); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// AVFrame* frame = av_frame_alloc(); +// AVFrame* frame_rgb = av_frame_alloc(); +// if (!frame || !frame_rgb) { +// LOG_TEE("Could not allocate frames."); +// av_frame_free(&frame); 
+// av_frame_free(&frame_rgb); +// avcodec_free_context(&codec_ctx); +// avformat_close_input(&format_ctx); +// return {}; +// } + +// int num_bytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1); +// uint8_t* buffer = (uint8_t*)av_malloc(num_bytes * sizeof(uint8_t)); +// av_image_fill_arrays(frame_rgb->data, frame_rgb->linesize, buffer, AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1); + +// struct SwsContext* sws_ctx = sws_getContext(codec_ctx->width, codec_ctx->height, codec_ctx->pix_fmt, +// codec_ctx->width, codec_ctx->height, AV_PIX_FMT_RGB24, +// SWS_BILINEAR, nullptr, nullptr, nullptr); + +// std::vector frames; + +// AVPacket packet; +// int64_t last_pts = AV_NOPTS_VALUE; +// int64_t total_frames = format_ctx->streams[video_stream_index]->nb_frames; +// // LOG_TEE("total_frames: %lld\n", total_frames); + +// int64_t frame_interval = (int64_t)codec_ctx->framerate.num / codec_ctx->framerate.den; +// // LOG_TEE("frame_interval: %lld\n", frame_interval); +// // LOG_TEE("codec_ctx->framerate.num: %lld\n", codec_ctx->framerate.num); +// // LOG_TEE("codec_ctx->framerate.den: %lld\n", codec_ctx->framerate.den); + +// float frame_len = 1.0 * total_frames / frame_interval; +// LOG_TEE("frame_len: %f\n", frame_len); +// if(frame_len > 15){ +// frame_interval = (int64_t)(1.0 * total_frames / 15); +// } +// // LOG_TEE("frame_interval: %lld\n", frame_interval); +// int frame_idx = 0; +// while (av_read_frame(format_ctx, &packet) >= 0) { +// if (packet.stream_index == video_stream_index) { +// if (avcodec_send_packet(codec_ctx, &packet) == 0) { +// for(;avcodec_receive_frame(codec_ctx, frame) == 0;frame_idx++) { +// // int frame_idx = frame->pts/codec_ctx->framerate.den; +// // LOG_TEE("frame_idx: %d %d\n", frame_idx, frame_idx % frame_interval); +// if (frame->pts != last_pts && (frame_idx) % frame_interval == 0) { +// sws_scale(sws_ctx, frame->data, frame->linesize, 0, codec_ctx->height, +// frame_rgb->data, 
frame_rgb->linesize); + +// clip_image_u8 * img = clip_image_u8_init(); +// img->nx = codec_ctx->width; +// img->ny = codec_ctx->height; +// img->buf.resize(num_bytes); +// std::copy(buffer, buffer + num_bytes, img->buf.begin()); + +// frames.push_back(img); +// last_pts = frame->pts; +// } +// } +// } +// } +// av_packet_unref(&packet); +// } + +// av_free(buffer); +// av_frame_free(&frame_rgb); +// av_frame_free(&frame); +// avcodec_free_context(&codec_ctx); +// avformat_close_input(&format_ctx); +// sws_freeContext(sws_ctx); + +// return frames; +// } static void show_additional_info(int /*argc*/, char ** argv) { LOG_TEE("\n example usage: %s -m --mmproj [--video ] [--image ] [--image ] [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); @@ -471,39 +471,39 @@ int main(int argc, char ** argv) { int n_past = 0; struct llava_context * ctx_llava = nullptr; - if (params.video.size() > 0){ - ctx_llava = llava_init_context(&params); - auto video = params.video; - std::vector frames = extract_frames(video.c_str()); - process_prompt(0, ctx_llava, &params, n_past); - // LOG_TEE("frames.size: %zu\n", frames.size()); - for (size_t i = 0; i < frames.size(); ++i) { - auto embeds = video_image_embed(ctx_llava->ctx_clip, &params, frames[i]); - process_input(ctx_llava, &params, 1, "", n_past, embeds); - } - process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past); - process_prompt(2, ctx_llava, &params, n_past); - - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); - const int max_tgt_len = params.n_predict < 0 ? 
8192 : params.n_predict; - std::string response = ""; - bool have_tmp = false; - for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); - response += tmp; - if (strcmp(tmp, "") == 0){ - if(!have_tmp)continue; - else break; - } - have_tmp = true; - printf("%s", tmp); - if (strstr(response.c_str(), "")) break; // minicpm-v - - fflush(stdout); - } - llama_sampling_free(ctx_sampling); - } - else { + // if (params.video.size() > 0){ + // ctx_llava = llava_init_context(&params); + // auto video = params.video; + // std::vector frames = extract_frames(video.c_str()); + // process_prompt(0, ctx_llava, &params, n_past); + // // LOG_TEE("frames.size: %zu\n", frames.size()); + // for (size_t i = 0; i < frames.size(); ++i) { + // auto embeds = video_image_embed(ctx_llava->ctx_clip, &params, frames[i]); + // process_input(ctx_llava, &params, 1, "", n_past, embeds); + // } + // process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past); + // process_prompt(2, ctx_llava, &params, n_past); + + // struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + // const int max_tgt_len = params.n_predict < 0 ? 8192 : params.n_predict; + // std::string response = ""; + // bool have_tmp = false; + // for (int i = 0; i < max_tgt_len; i++) { + // auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + // response += tmp; + // if (strcmp(tmp, "") == 0){ + // if(!have_tmp)continue; + // else break; + // } + // have_tmp = true; + // printf("%s", tmp); + // if (strstr(response.c_str(), "")) break; // minicpm-v + + // fflush(stdout); + // } + // llama_sampling_free(ctx_sampling); + // } + // else { if (params.image.size() > 1) { ctx_llava = llava_init_context(&params); process_prompt(0, ctx_llava, &params, n_past); @@ -585,7 +585,7 @@ int main(int argc, char ** argv) { ctx_llava->model = NULL; llava_free(ctx_llava); - } + // } return 0; } \ No newline at end of file