Skip to content

Commit

Permalink
Merge branch 'the-function-"clip"-should-be-int' into support-video-u…
Browse files Browse the repository at this point in the history
…nderstanding
  • Loading branch information
tc-mb committed Aug 29, 2024
2 parents 41ad342 + 8eefb1e commit 86ad823
Show file tree
Hide file tree
Showing 56 changed files with 8,282 additions and 6,850 deletions.
25 changes: 11 additions & 14 deletions .devops/full-cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements
Expand All @@ -24,13 +22,12 @@ WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc)
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
cp build/bin/* .

ENTRYPOINT ["/app/.devops/tools.sh"]
24 changes: 13 additions & 11 deletions .devops/llama-cli-cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,35 +1,37 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential git cmake

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1

RUN make -j$(nproc) llama-cli
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
apt-get install -y libgomp1

COPY --from=build /app/llama-cli /llama-cli
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli

ENTRYPOINT [ "/llama-cli" ]
29 changes: 16 additions & 13 deletions .devops/llama-server-cuda.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,38 +1,41 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1

RUN make -j$(nproc) llama-server
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server

# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-server-intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ RUN apt-get update && \
COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-server-rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

# Enable cURL
ENV LLAMA_CURL=1
Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-server-vulkan.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

Expand Down
2 changes: 2 additions & 0 deletions .devops/llama-server.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ RUN apt-get update && \
COPY --from=build /app/llama-server /llama-server

ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

Expand Down
2 changes: 1 addition & 1 deletion .ecrc
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"Exclude": ["^\\.gitmodules$"],
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
"Disable": {
"IndentSize": true
}
Expand Down
15 changes: 3 additions & 12 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,12 @@ jobs:
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

- name: Build and push Docker image (versioned)
- name: Build and push Docker image (tagged + versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}

- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}
29 changes: 17 additions & 12 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
Expand Down Expand Up @@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
Expand All @@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then

CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
## helpers

# download a file if it does not exist or if it is outdated
Expand Down Expand Up @@ -107,7 +114,7 @@ function gg_run_ctest_debug {
gg_check_build_requirements

(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

Expand Down Expand Up @@ -138,7 +145,7 @@ function gg_run_ctest_release {
gg_check_build_requirements

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
Expand Down Expand Up @@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
}

# open_llama_7b_v2
# requires: GG_BUILD_CUDA

function gg_run_open_llama_7b_v2 {
cd ${SRC}
Expand All @@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

Expand Down Expand Up @@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

Expand Down Expand Up @@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
}

# pythia_2_8b
# requires: GG_BUILD_CUDA

function gg_run_pythia_2_8b {
cd ${SRC}
Expand All @@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

Expand Down Expand Up @@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

Expand Down Expand Up @@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi

if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b
Expand Down
25 changes: 19 additions & 6 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
Expand All @@ -341,6 +345,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
get_env("LLAMA_ARG_HOST", params.hostname);
get_env("LLAMA_ARG_PORT", params.port);
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
Expand Down Expand Up @@ -914,7 +921,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
Expand Down Expand Up @@ -1876,13 +1883,19 @@ std::string string_get_sortable_timestamp() {

void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) {
s.replace(pos, search.length(), replace);
pos += replace.length();
}
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}

void string_process_escapes(std::string & input) {
Expand Down
Loading

0 comments on commit 86ad823

Please sign in to comment.