Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
Concedo 2023-06-16 16:37:14 +08:00
commit 7ef8d740b9
10 changed files with 532 additions and 452 deletions

.gitignore (vendored)

@@ -29,6 +29,7 @@ build-sanitize-thread/
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /Pipfile

Package.swift

@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],

examples/chat-vicuna.sh (new executable file)

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+    $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"

examples/common.cpp

@@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }

examples/server/README.md

@@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa
 To get started right away, run the following command, making sure to use the correct path for the model you have:
 
 #### Unix-based systems (Linux, macOS, etc.):
 
+Make sure to build with the server option on
+```bash
+LLAMA_BUILD_SERVER=1 make
+```
 ```bash
 ./server -m models/7B/ggml-model.bin --ctx_size 2048

ggml-cuda.cu

@@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \
@@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device > g_device_count) {
+    if (main_device >= g_device_count) {
        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                main_device, g_device_count, g_main_device);
        return;
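
Note on the second hunk above: CUDA device indices are zero-based, so with `g_device_count` devices the last valid index is `g_device_count - 1`, and the old `>` comparison let the out-of-range value `main_device == g_device_count` through. A standalone C sketch of the corrected check; the `pick_main_device` helper and the example values are illustrative only, not part of the diff:

```c
#include <stdio.h>

// Illustrative helper mirroring the corrected bounds check in ggml_cuda_set_main_device:
// valid device indices are 0 .. device_count - 1, so anything >= device_count is rejected.
static int pick_main_device(int requested, int device_count, int current_main) {
    if (requested >= device_count) {
        fprintf(stderr,
                "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                requested, device_count, current_main);
        return current_main; // keep the previous main device
    }
    return requested; // index is in range [0, device_count)
}

int main(void) {
    // With 2 devices, index 2 is out of range; the old '>' check would have accepted it.
    printf("%d\n", pick_main_device(2, 2, 0)); // prints 0 (falls back to the current main device)
    printf("%d\n", pick_main_device(1, 2, 0)); // prints 1 (in range)
    return 0;
}
```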

ggml-metal.h

@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus

ggml-metal.m

@@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
         struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
             size_t offs_src0 = 0;
             size_t offs_src1 = 0;
             size_t offs_dst = 0;
 
-            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
 
             id<MTLComputeCommandEncoder> encoder = nil;
 
-            for (int i = 0; i < gf->n_nodes; ++i) {
-                //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+            for (int i = node_start; i < node_end; ++i) {
+                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
                 struct ggml_tensor * src0 = gf->nodes[i]->src0;
                 struct ggml_tensor * src1 = gf->nodes[i]->src1;
@@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
                         }
                     };
 
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        [command_buffer waitUntilCompleted];
-
-        {
-            const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-            UNUSED(time_elapsed);
-            metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-        }
+        });
     }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
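
For context on the first hunk above: the graph nodes are split across `n_cb` command buffers by ceiling division, and the last buffer takes whatever remains so no node is skipped. A minimal C sketch of that partitioning arithmetic, using made-up sizes (115 nodes, 8 command buffers); it is not code from the diff:

```c
#include <stdio.h>

int main(void) {
    // Assumed example sizes; in the diff these come from gf->n_nodes and gf->n_threads.
    const int n_nodes = 115;
    const int n_cb    = 8;

    // Ceiling division so every node is assigned to some command buffer.
    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; // (115 + 7) / 8 = 15

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int node_start = cb_idx * n_nodes_per_cb;
        // The last buffer stops at n_nodes, so the ranges never run past the end of the graph.
        const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
        printf("command buffer %d encodes nodes [%d, %d)\n", cb_idx, node_start, node_end);
    }
    return 0;
}
```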

llama.h

@@ -244,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
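
The three accessors above take no arguments in this revision of the C API. A minimal usage sketch, assuming this llama.h is on the include path; the function and variable names below (`seed_prompt`, `is_done`, `tokens`, `next_token`) are illustrative, not part of the header:

```c
#include "llama.h"

// Start the token stream with the beginning-of-sentence token, as is conventional.
static void seed_prompt(llama_token * tokens, int * n_tokens) {
    tokens[(*n_tokens)++] = llama_token_bos();
}

// Stop generating once the model emits the end-of-sentence token.
static int is_done(llama_token next_token) {
    return next_token == llama_token_eos();
}
```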

spm-headers/ggml.h (new symbolic link)

@@ -0,0 +1 @@
+../ggml.h