Merge branch 'master' into concedo_experimental
# Conflicts: # CMakeLists.txt # Makefile
This commit is contained in:
commit
7ef8d740b9
10 changed files with 532 additions and 452 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -29,6 +29,7 @@ build-sanitize-thread/
|
|||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
/train-text-from-scratch
|
||||
/benchmark-matmult
|
||||
/vdot
|
||||
/Pipfile
|
||||
|
|
|
@ -11,6 +11,7 @@ let package = Package(
|
|||
.target(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
exclude: ["ggml-metal.metal"],
|
||||
sources: ["ggml.c", "llama.cpp"],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
|
||||
|
|
41
examples/chat-vicuna.sh
Executable file
41
examples/chat-vicuna.sh
Executable file
|
@ -0,0 +1,41 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
cd "$(dirname "$0")/.." || exit
|
||||
|
||||
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
|
||||
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
|
||||
USER_NAME="### Human"
|
||||
AI_NAME="### Assistant"
|
||||
|
||||
# Adjust to the number of CPU cores you want to use.
|
||||
N_THREAD="${N_THREAD:-8}"
|
||||
# Number of tokens to predict (made it larger than default because we want a long interaction)
|
||||
N_PREDICTS="${N_PREDICTS:-2048}"
|
||||
|
||||
# Note: you can also override the generation options by specifying them on the command line:
|
||||
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
|
||||
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
|
||||
|
||||
DATE_TIME=$(date +%H:%M)
|
||||
DATE_YEAR=$(date +%Y)
|
||||
|
||||
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
|
||||
|
||||
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
|
||||
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
|
||||
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
|
||||
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
|
||||
$PROMPT_TEMPLATE > $PROMPT_FILE
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
./bin/main $GEN_OPTIONS \
|
||||
--model "$MODEL" \
|
||||
--threads "$N_THREAD" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
--color --interactive \
|
||||
--file ${PROMPT_FILE} \
|
||||
--reverse-prompt "### Human:" \
|
||||
--in-prefix ' ' \
|
||||
"$@"
|
|
@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
|
||||
fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
|
||||
exit(1);
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
if (escape_prompt) {
|
||||
process_escapes(params.prompt);
|
||||
}
|
||||
|
|
|
@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa
|
|||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
Make sure to build with the server option on
|
||||
```bash
|
||||
LLAMA_BUILD_SERVER=1 make
|
||||
```
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.bin --ctx_size 2048
|
||||
|
|
|
@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
#if CUDART_VERSION >= 12
|
||||
#if CUDART_VERSION >= 12000
|
||||
#define CUBLAS_CHECK(err) \
|
||||
do { \
|
||||
cublasStatus_t err_ = (err); \
|
||||
|
@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
|||
}
|
||||
|
||||
void ggml_cuda_set_main_device(int main_device) {
|
||||
if (main_device > g_device_count) {
|
||||
if (main_device >= g_device_count) {
|
||||
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
||||
main_device, g_device_count, g_main_device);
|
||||
return;
|
||||
|
|
|
@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
|
|||
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
||||
|
||||
// same as ggml_graph_compute but uses Metal
|
||||
// creates gf->n_threads command buffers in parallel
|
||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
45
ggml-metal.m
45
ggml-metal.m
|
@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
|
|||
struct ggml_cgraph * gf) {
|
||||
metal_printf("%s: evaluating graph\n", __func__);
|
||||
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
|
||||
const int n_cb = gf->n_threads;
|
||||
|
||||
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
command_buffers[i] = [ctx->queue commandBuffer];
|
||||
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffers[i] enqueue];
|
||||
}
|
||||
|
||||
// TODO: is this the best way to start threads?
|
||||
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
dispatch_async(queue, ^{
|
||||
size_t offs_src0 = 0;
|
||||
size_t offs_src1 = 0;
|
||||
size_t offs_dst = 0;
|
||||
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
|
||||
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = nil;
|
||||
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
|
||||
|
||||
for (int i = node_start; i < node_end; ++i) {
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src0;
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src1;
|
||||
|
@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
|
@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
|
|||
}
|
||||
|
||||
[command_buffer commit];
|
||||
[command_buffer waitUntilCompleted];
|
||||
|
||||
{
|
||||
const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
|
||||
UNUSED(time_elapsed);
|
||||
|
||||
metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
|
||||
});
|
||||
}
|
||||
|
||||
// wait for all threads to finish
|
||||
dispatch_barrier_sync(queue, ^{});
|
||||
|
||||
[command_buffers[n_cb - 1] waitUntilCompleted];
|
||||
}
|
||||
|
|
6
llama.h
6
llama.h
|
@ -244,9 +244,9 @@ extern "C" {
|
|||
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos();
|
||||
LLAMA_API llama_token llama_token_eos();
|
||||
LLAMA_API llama_token llama_token_nl();
|
||||
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl(); // next-line
|
||||
|
||||
// Sampling functions
|
||||
|
||||
|
|
1
spm-headers/ggml.h
Symbolic link
1
spm-headers/ggml.h
Symbolic link
|
@ -0,0 +1 @@
|
|||
../ggml.h
|
Loading…
Add table
Add a link
Reference in a new issue