Merge remote-tracking branch 'upstream/master'
# Conflicts:
#	examples/server/README.md
commit 488c62acf9
11 changed files with 547 additions and 457 deletions

.gitignore (vendored, 1 change)

@@ -32,6 +32,7 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /server

CMakeLists.txt

@@ -163,12 +163,23 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_path(BLAS_INCLUDE_DIRS
+            NAMES cblas.h
+            HINTS
+                /usr/include
+                /usr/local/include
+                /usr/include/openblas
+        )
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
         add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
 
-        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
-        include_directories(${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"

@@ -408,7 +419,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}
             )
 
-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 

Makefile (7 changes)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch
 
 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server

@@ -261,7 +261,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
 
 #
 # Examples

@@ -291,6 +291,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \

Package.swift

@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],

examples/chat-vicuna.sh (new executable file, 41 lines)

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+    $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"

examples/common.cpp

@@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }

ggml-cuda.cu

@@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         } \
     } while (0)
 
-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \

@@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device > g_device_count) {
+    if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
         return;
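
Note on the two one-character fixes above (context only, not part of the diff): CUDART_VERSION encodes the CUDA runtime version as major*1000 + minor*10, so CUDA 12.0 is 12000 and the old ">= 12" test was true on every runtime; likewise device ids run from 0 to g_device_count - 1, so an id equal to the count is already out of range and needs ">=". A standalone C sketch illustrating both checks (FAKE_CUDART_VERSION and the local device_count are made up for illustration):

// Standalone sketch, not from the commit: why the two comparisons were wrong.
#include <stdio.h>

#define FAKE_CUDART_VERSION 11080  // pretend we are building against CUDA 11.8

int main(void) {
    // Old check: 11080 >= 12 is true, so CUDA 11 builds took the CUDA 12 path.
    printf("old check (>= 12):    %d\n", FAKE_CUDART_VERSION >= 12);
    // New check: 11080 >= 12000 is false, as intended.
    printf("new check (>= 12000): %d\n", FAKE_CUDART_VERSION >= 12000);

    // Device ids are 0 .. count-1, so an id equal to the count is invalid;
    // the old '>' accepted main_device == device_count.
    const int device_count = 2;
    for (int main_device = 1; main_device <= 3; ++main_device) {
        printf("main_device=%d  old(>): %s  new(>=): %s\n", main_device,
               main_device >  device_count ? "rejected" : "accepted",
               main_device >= device_count ? "rejected" : "accepted");
    }
    return 0;
}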

ggml-metal.h

@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus

ggml-metal.m (45 changes)

@@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
         struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
     size_t offs_dst  = 0;
 
-    id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+    id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+
     id<MTLComputeCommandEncoder> encoder = nil;
 
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+    const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+    const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+    for (int i = node_start; i < node_end; ++i) {
+        metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
         struct ggml_tensor * src0 = gf->nodes[i]->src0;
         struct ggml_tensor * src1 = gf->nodes[i]->src1;

@@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
                 }
             };
 
-
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

@@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        [command_buffer waitUntilCompleted];
-
-        {
-            const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-            UNUSED(time_elapsed);
-
-            metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
+        });
     }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
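
Note (reviewer context, not part of the diff): the heart of the ggml-metal.m change is that the graph's nodes are split across n_cb = gf->n_threads command buffers by ceiling division, with the last buffer picking up the remainder, and each buffer is encoded from its own dispatch block; the buffers are enqueued up front so they run in order, and the CPU then only waits on the last one. A standalone C sketch of just the partitioning arithmetic (the node and buffer counts are made up):

// Standalone sketch of the node partitioning used in ggml_metal_graph_compute.
#include <stdio.h>

int main(void) {
    const int n_nodes = 10; // made-up graph size
    const int n_cb    = 3;  // made-up number of command buffers

    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; // ceiling division

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int node_start = (cb_idx + 0) * n_nodes_per_cb;
        const int node_end   = (cb_idx == n_cb - 1) ? n_nodes
                                                    : (cb_idx + 1) * n_nodes_per_cb;
        // prints [0, 4), [4, 8), [8, 10) for 10 nodes over 3 buffers
        printf("command buffer %d encodes nodes [%d, %d)\n", cb_idx, node_start, node_end);
    }
    return 0;
}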

llama.h (6 changes)

@@ -244,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
 
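
Note (usage context, not part of the diff): these three getters are the public way to ask for the special token ids, and the new comments spell out what each one means. A minimal sketch of how calling code might use them (describe_token is an illustrative helper, not a llama.cpp function):

// Sketch only: classify a sampled token id using the getters declared above.
#include <stdio.h>
#include "llama.h"

static void describe_token(llama_token id) {
    if (id == llama_token_bos()) {
        printf("%d is the beginning-of-sentence token\n", id);
    } else if (id == llama_token_eos()) {
        printf("%d is the end-of-sentence token\n", id);
    } else if (id == llama_token_nl()) {
        printf("%d is the next-line token\n", id);
    } else {
        printf("%d is an ordinary vocabulary token\n", id);
    }
}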

spm-headers/ggml.h (symbolic link, 1 change)

@@ -0,0 +1 @@
+../ggml.h