Merge remote-tracking branch 'upstream/master' into t5-clean-3
This commit is contained in:
commit
7293243d4f
362 changed files with 2733 additions and 2068 deletions
|
@ -39,13 +39,13 @@ else()
|
|||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (LLAMA_RPC)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (LLAMA_SYCL)
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
|
|
|
@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
|||
## Example
|
||||
|
||||
```bash
|
||||
LLAMA_CUDA=1 make -j
|
||||
GGML_CUDA=1 make -j
|
||||
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#include <string>
|
||||
#include <unistd.h>
|
||||
#include "llama.h"
|
||||
#include "common/common.h"
|
||||
#include "common.h"
|
||||
|
||||
// Write C++ code here.
|
||||
//
|
||||
|
|
|
@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens
|
|||
## Orin compile and run
|
||||
### compile
|
||||
```sh
|
||||
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
||||
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
||||
```
|
||||
### run on Orin
|
||||
### case 1
|
||||
|
|
|
@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
}
|
||||
if (n < 32)
|
||||
hparams.image_grid_pinpoints[n] = 0;
|
||||
} catch (std::runtime_error & e) {
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
hparams.image_grid_pinpoints[0]=0;
|
||||
}
|
||||
|
||||
try {
|
||||
int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
|
||||
strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
|
||||
} catch (std::runtime_error & e) {
|
||||
} catch (std::runtime_error & /*e*/) {
|
||||
strcpy(hparams.mm_patch_merge_type, "flat");
|
||||
}
|
||||
|
||||
try {
|
||||
hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
|
||||
} catch(const std::exception& e) {
|
||||
} catch(const std::exception& /*e*/) {
|
||||
hparams.image_crop_resolution = hparams.image_size;
|
||||
}
|
||||
|
||||
|
@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
try {
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
new_clip->has_class_embedding = true;
|
||||
} catch (const std::exception& e) {
|
||||
} catch (const std::exception& /*e*/) {
|
||||
new_clip->has_class_embedding = false;
|
||||
}
|
||||
|
||||
|
@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
new_clip->has_pre_norm = true;
|
||||
} catch (std::exception & e) {
|
||||
} catch (std::exception & /*e*/) {
|
||||
new_clip->has_pre_norm = false;
|
||||
}
|
||||
|
||||
|
@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
|
||||
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
|
||||
new_clip->has_post_norm = true;
|
||||
} catch (std::exception & e) {
|
||||
} catch (std::exception & /*e*/) {
|
||||
new_clip->has_post_norm = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
|
||||
new_clip->has_patch_bias = true;
|
||||
} catch (std::exception & e) {
|
||||
} catch (std::exception & /*e*/) {
|
||||
new_clip->has_patch_bias = false;
|
||||
}
|
||||
|
||||
try {
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
} catch(const std::exception& e) {
|
||||
} catch(const std::exception& /*e*/) {
|
||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
||||
}
|
||||
|
||||
|
@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
// Yi-type llava
|
||||
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
try {
|
||||
// missing in Yi-type llava
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
|
||||
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
|
||||
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
try {
|
||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
} catch (std::runtime_error & e) { }
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
|
|
|
@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
|
|||
|
||||
## Usage
|
||||
|
||||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
|
||||
On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
|
||||
For example, to build the CUDA backend with RPC support:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc-cuda
|
||||
cd build-rpc-cuda
|
||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
||||
cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
|
@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
|||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
||||
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc
|
||||
cd build-rpc
|
||||
cmake .. -DLLAMA_RPC=ON
|
||||
cmake .. -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
|
|
|
@ -1,7 +1,14 @@
|
|||
set(TARGET llama-server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
if (MINGW)
|
||||
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||
endif()
|
||||
|
||||
set(TARGET_SRCS
|
||||
server.cpp
|
||||
utils.hpp
|
||||
|
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
|
|||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||
|
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
|
|||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||
)
|
||||
endforeach()
|
||||
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if (LLAMA_SERVER_SSL)
|
||||
find_package(OpenSSL REQUIRED)
|
||||
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
||||
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -8,10 +8,10 @@ cd build
|
|||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
|
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
|||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "Ninja" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue