Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
Concedo 2023-06-16 16:37:14 +08:00
commit 7ef8d740b9
10 changed files with 532 additions and 452 deletions

.gitignore (vendored)

@@ -29,6 +29,7 @@ build-sanitize-thread/
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /Pipfile

Package.swift

@@ -11,6 +11,7 @@ let package = Package(
         .target(
             name: "llama",
             path: ".",
+            exclude: ["ggml-metal.metal"],
             sources: ["ggml.c", "llama.cpp"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],

examples/chat-vicuna.sh (new executable file)

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+    $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/main $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"

examples/common.cpp

@@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
     if (escape_prompt) {
         process_escapes(params.prompt);
     }

examples/server/README.md

@@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa
 To get started right away, run the following command, making sure to use the correct path for the model you have:
 
 #### Unix-based systems (Linux, macOS, etc.):
 
+Make sure to build with the server option on
+```bash
+LLAMA_BUILD_SERVER=1 make
+```
 ```bash
 ./server -m models/7B/ggml-model.bin --ctx_size 2048

ggml-cuda.cu

@@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     } \
 } while (0)
 
-#if CUDART_VERSION >= 12
+#if CUDART_VERSION >= 12000
 #define CUBLAS_CHECK(err) \
     do { \
         cublasStatus_t err_ = (err); \
@@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_set_main_device(int main_device) {
-    if (main_device > g_device_count) {
+    if (main_device >= g_device_count) {
        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                main_device, g_device_count, g_main_device);
        return;
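
Note on the second hunk above: CUDA device indices are zero-based, so with `g_device_count` devices the last valid index is `g_device_count - 1`, and the old `>` comparison let the out-of-range value `main_device == g_device_count` through. A standalone C sketch of the corrected check; the `pick_main_device` helper and the example values are illustrative only, not part of the diff:

```c
#include <stdio.h>

// Illustrative helper mirroring the corrected bounds check in ggml_cuda_set_main_device:
// valid device indices are 0 .. device_count - 1, so anything >= device_count is rejected.
static int pick_main_device(int requested, int device_count, int current_main) {
    if (requested >= device_count) {
        fprintf(stderr,
                "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                requested, device_count, current_main);
        return current_main; // keep the previous main device
    }
    return requested; // index is in range [0, device_count)
}

int main(void) {
    // With 2 devices, index 2 is out of range; the old '>' check would have accepted it.
    printf("%d\n", pick_main_device(2, 2, 0)); // prints 0 (falls back to the current main device)
    printf("%d\n", pick_main_device(1, 2, 0)); // prints 1 (in range)
    return 0;
}
```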

ggml-metal.h

@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus

ggml-metal.m

@@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
         struct ggml_cgraph * gf) {
     metal_printf("%s: evaluating graph\n", __func__);
 
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
             size_t offs_src0 = 0;
             size_t offs_src1 = 0;
             size_t offs_dst = 0;
 
-            id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
 
             id<MTLComputeCommandEncoder> encoder = nil;
 
-            for (int i = 0; i < gf->n_nodes; ++i) {
-                //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+            const int node_start = (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+            for (int i = node_start; i < node_end; ++i) {
+                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
 
                 struct ggml_tensor * src0 = gf->nodes[i]->src0;
                 struct ggml_tensor * src1 = gf->nodes[i]->src1;
@@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
                         }
                     };
 
                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                 [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
         }
 
         [command_buffer commit];
-        [command_buffer waitUntilCompleted];
-
-        {
-            const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-            UNUSED(time_elapsed);
-            metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-        }
+        });
     }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
 }
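
For context on the first hunk above: the graph nodes are split across `n_cb` command buffers by ceiling division, and the last buffer takes whatever remains so no node is skipped. A minimal C sketch of that partitioning arithmetic, using made-up sizes (115 nodes, 8 command buffers); it is not code from the diff:

```c
#include <stdio.h>

int main(void) {
    // Assumed example sizes; in the diff these come from gf->n_nodes and gf->n_threads.
    const int n_nodes = 115;
    const int n_cb    = 8;

    // Ceiling division so every node is assigned to some command buffer.
    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; // (115 + 7) / 8 = 15

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int node_start = cb_idx * n_nodes_per_cb;
        // The last buffer stops at n_nodes, so the ranges never run past the end of the graph.
        const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
        printf("command buffer %d encodes nodes [%d, %d)\n", cb_idx, node_start, node_end);
    }
    return 0;
}
```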

llama.h

@@ -244,9 +244,9 @@ extern "C" {
     LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl();  // next-line
 
     // Sampling functions
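
The three accessors above take no arguments in this revision of the C API. A minimal usage sketch, assuming this llama.h is on the include path; the function and variable names below (`seed_prompt`, `is_done`, `tokens`, `next_token`) are illustrative, not part of the header:

```c
#include "llama.h"

// Start the token stream with the beginning-of-sentence token, as is conventional.
static void seed_prompt(llama_token * tokens, int * n_tokens) {
    tokens[(*n_tokens)++] = llama_token_bos();
}

// Stop generating once the model emits the end-of-sentence token.
static int is_done(llama_token next_token) {
    return next_token == llama_token_eos();
}
```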

spm-headers/ggml.h (new symbolic link)

@@ -0,0 +1 @@
+../ggml.h