Merge remote-tracking branch 'upstream/master'
# Conflicts:
#	examples/server/README.md
commit 488c62acf9
11 changed files with 547 additions and 457 deletions

.gitignore (vendored, 1 change)

@@ -32,6 +32,7 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /server

CMakeLists.txt

@@ -163,12 +163,23 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_path(BLAS_INCLUDE_DIRS
+            NAMES cblas.h
+            HINTS
+                /usr/include
+                /usr/local/include
+                /usr/include/openblas
+        )
+
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
         add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
 
-        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
-        include_directories(${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"

@@ -408,7 +419,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_EXTRA}
             )
 
-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 

Makefile (7 changes)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch
 
 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server

@@ -261,7 +261,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
 
 #
 # Examples

@@ -291,6 +291,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
 
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \

Package.swift

@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],

examples/chat-vicuna.sh (new executable file, 41 lines)

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+    $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"

examples/common.cpp

@@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }

ggml-cuda.cu

@@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
         } \
     } while (0)
 
-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \

@@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device > g_device_count) {
+    if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
         return;
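
Note on the two one-character fixes above (context only, not part of the diff): CUDART_VERSION encodes the CUDA runtime version as major*1000 + minor*10, so CUDA 12.0 is 12000 and the old ">= 12" test was true on every runtime; likewise device ids run from 0 to g_device_count - 1, so an id equal to the count is already out of range and needs ">=". A standalone C sketch illustrating both checks (FAKE_CUDART_VERSION and the local device_count are made up for illustration):

// Standalone sketch, not from the commit: why the two comparisons were wrong.
#include <stdio.h>

#define FAKE_CUDART_VERSION 11080  // pretend we are building against CUDA 11.8

int main(void) {
    // Old check: 11080 >= 12 is true, so CUDA 11 builds took the CUDA 12 path.
    printf("old check (>= 12):    %d\n", FAKE_CUDART_VERSION >= 12);
    // New check: 11080 >= 12000 is false, as intended.
    printf("new check (>= 12000): %d\n", FAKE_CUDART_VERSION >= 12000);

    // Device ids are 0 .. count-1, so an id equal to the count is invalid;
    // the old '>' accepted main_device == device_count.
    const int device_count = 2;
    for (int main_device = 1; main_device <= 3; ++main_device) {
        printf("main_device=%d  old(>): %s  new(>=): %s\n", main_device,
               main_device >  device_count ? "rejected" : "accepted",
               main_device >= device_count ? "rejected" : "accepted");
    }
    return 0;
}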

ggml-metal.h

@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus

ggml-metal.m (45 changes)

@@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
         struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
     size_t offs_dst  = 0;
 
-    id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+    id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+
     id<MTLComputeCommandEncoder> encoder = nil;
 
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+    const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+    const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+    for (int i = node_start; i < node_end; ++i) {
+        metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
         struct ggml_tensor * src0 = gf->nodes[i]->src0;
         struct ggml_tensor * src1 = gf->nodes[i]->src1;

@@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
                 }
             };
 
-
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

@@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        [command_buffer waitUntilCompleted];
-
-        {
-            const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-            UNUSED(time_elapsed);
-
-            metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
+        });
     }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
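
Note (reviewer context, not part of the diff): the heart of the ggml-metal.m change is that the graph's nodes are split across n_cb = gf->n_threads command buffers by ceiling division, with the last buffer picking up the remainder, and each buffer is encoded from its own dispatch block; the buffers are enqueued up front so they run in order, and the CPU then only waits on the last one. A standalone C sketch of just the partitioning arithmetic (the node and buffer counts are made up):

// Standalone sketch of the node partitioning used in ggml_metal_graph_compute.
#include <stdio.h>

int main(void) {
    const int n_nodes = 10; // made-up graph size
    const int n_cb    = 3;  // made-up number of command buffers

    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; // ceiling division

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int node_start = (cb_idx + 0) * n_nodes_per_cb;
        const int node_end   = (cb_idx == n_cb - 1) ? n_nodes
                                                    : (cb_idx + 1) * n_nodes_per_cb;
        // prints [0, 4), [4, 8), [8, 10) for 10 nodes over 3 buffers
        printf("command buffer %d encodes nodes [%d, %d)\n", cb_idx, node_start, node_end);
    }
    return 0;
}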

llama.h (6 changes)

@@ -244,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
 
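
Note (usage context, not part of the diff): these three getters are the public way to ask for the special token ids, and the new comments spell out what each one means. A minimal sketch of how calling code might use them (describe_token is an illustrative helper, not a llama.cpp function):

// Sketch only: classify a sampled token id using the getters declared above.
#include <stdio.h>
#include "llama.h"

static void describe_token(llama_token id) {
    if (id == llama_token_bos()) {
        printf("%d is the beginning-of-sentence token\n", id);
    } else if (id == llama_token_eos()) {
        printf("%d is the end-of-sentence token\n", id);
    } else if (id == llama_token_nl()) {
        printf("%d is the next-line token\n", id);
    } else {
        printf("%d is an ordinary vocabulary token\n", id);
    }
}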

spm-headers/ggml.h (symbolic link, 1 change)

@@ -0,0 +1 @@
+../ggml.h