Merge branch 'master' into add_gpt2_support
This commit is contained in:
commit
c4d1c26710
24 changed files with 1018 additions and 421 deletions
177
.github/ISSUE_TEMPLATE/bug.md
vendored
177
.github/ISSUE_TEMPLATE/bug.md
vendored
|
@ -6,179 +6,4 @@ assignees: ''
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Prerequisites
|
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
|
||||||
|
|
||||||
Please answer the following questions for yourself before submitting an issue.
|
|
||||||
|
|
||||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
|
||||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
|
||||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
|
||||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
|
||||||
|
|
||||||
# Expected Behavior
|
|
||||||
|
|
||||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
|
|
||||||
|
|
||||||
# Current Behavior
|
|
||||||
|
|
||||||
Please provide a detailed written description of what `llama.cpp` did, instead.
|
|
||||||
|
|
||||||
# Environment and Context
|
|
||||||
|
|
||||||
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
|
|
||||||
|
|
||||||
* Physical (or virtual) hardware you are using, e.g. for Linux:
|
|
||||||
|
|
||||||
`$ lscpu`
|
|
||||||
|
|
||||||
* Operating System, e.g. for Linux:
|
|
||||||
|
|
||||||
`$ uname -a`
|
|
||||||
|
|
||||||
* SDK version, e.g. for Linux:
|
|
||||||
|
|
||||||
```
|
|
||||||
$ python3 --version
|
|
||||||
$ make --version
|
|
||||||
$ g++ --version
|
|
||||||
```
|
|
||||||
|
|
||||||
# Failure Information (for bugs)
|
|
||||||
|
|
||||||
Please help provide information about the failure / bug.
|
|
||||||
|
|
||||||
# Steps to Reproduce
|
|
||||||
|
|
||||||
Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
|
|
||||||
|
|
||||||
1. step 1
|
|
||||||
2. step 2
|
|
||||||
3. step 3
|
|
||||||
4. etc.
|
|
||||||
|
|
||||||
# Failure Logs
|
|
||||||
|
|
||||||
Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
|
|
||||||
|
|
||||||
Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
|
|
||||||
|
|
||||||
Example environment info:
|
|
||||||
```
|
|
||||||
llama.cpp$ git log | head -1
|
|
||||||
commit 2af23d30434a677c6416812eea52ccc0af65119c
|
|
||||||
|
|
||||||
llama.cpp$ lscpu | egrep "AMD|Flags"
|
|
||||||
Vendor ID: AuthenticAMD
|
|
||||||
Model name: AMD Ryzen Threadripper 1950X 16-Core Processor
|
|
||||||
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
|
|
||||||
Virtualization: AMD-V
|
|
||||||
|
|
||||||
llama.cpp$ python3 --version
|
|
||||||
Python 3.10.9
|
|
||||||
|
|
||||||
llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
|
|
||||||
numpy 1.24.2
|
|
||||||
numpydoc 1.5.0
|
|
||||||
sentencepiece 0.1.97
|
|
||||||
torch 1.13.1
|
|
||||||
torchvision 0.14.1
|
|
||||||
|
|
||||||
llama.cpp$ make --version | head -1
|
|
||||||
GNU Make 4.3
|
|
||||||
|
|
||||||
$ md5sum ./models/65B/ggml-model-q4_0.bin
|
|
||||||
dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin
|
|
||||||
```
|
|
||||||
|
|
||||||
Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
|
|
||||||
```
|
|
||||||
llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
|
|
||||||
main: seed = 1679149377
|
|
||||||
llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
|
|
||||||
llama_model_load: n_vocab = 32000
|
|
||||||
llama_model_load: n_ctx = 512
|
|
||||||
llama_model_load: n_embd = 8192
|
|
||||||
llama_model_load: n_mult = 256
|
|
||||||
llama_model_load: n_head = 64
|
|
||||||
llama_model_load: n_layer = 80
|
|
||||||
llama_model_load: n_rot = 128
|
|
||||||
llama_model_load: f16 = 2
|
|
||||||
llama_model_load: n_ff = 22016
|
|
||||||
llama_model_load: n_parts = 8
|
|
||||||
llama_model_load: ggml ctx size = 41477.73 MB
|
|
||||||
llama_model_load: memory_size = 2560.00 MB, n_mem = 40960
|
|
||||||
llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
|
|
||||||
llama_model_load: .......................................................................................... done
|
|
||||||
llama_model_load: model size = 4869.09 MB / num tensors = 723
|
|
||||||
|
|
||||||
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
|
|
||||||
|
|
||||||
main: prompt: 'Please close your issue when it has been answered.'
|
|
||||||
main: number of tokens in prompt = 11
|
|
||||||
1 -> ''
|
|
||||||
12148 -> 'Please'
|
|
||||||
3802 -> ' close'
|
|
||||||
596 -> ' your'
|
|
||||||
2228 -> ' issue'
|
|
||||||
746 -> ' when'
|
|
||||||
372 -> ' it'
|
|
||||||
756 -> ' has'
|
|
||||||
1063 -> ' been'
|
|
||||||
7699 -> ' answered'
|
|
||||||
29889 -> '.'
|
|
||||||
|
|
||||||
sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
|
|
||||||
|
|
||||||
|
|
||||||
Please close your issue when it has been answered.
|
|
||||||
@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
|
|
||||||
I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
|
|
||||||
@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
|
|
||||||
|
|
||||||
|
|
||||||
main: mem per token = 71159620 bytes
|
|
||||||
main: load time = 19309.95 ms
|
|
||||||
main: sample time = 168.62 ms
|
|
||||||
main: predict time = 223895.61 ms / 888.47 ms per token
|
|
||||||
main: total time = 246406.42 ms
|
|
||||||
|
|
||||||
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
|
|
||||||
|
|
||||||
3636882.89 msec task-clock # 14.677 CPUs utilized
|
|
||||||
13509 context-switches # 3.714 /sec
|
|
||||||
2436 cpu-migrations # 0.670 /sec
|
|
||||||
10476679 page-faults # 2.881 K/sec
|
|
||||||
13133115082869 cycles # 3.611 GHz (16.77%)
|
|
||||||
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
|
|
||||||
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
|
|
||||||
23479217109614 instructions # 1.79 insn per cycle
|
|
||||||
# 0.44 stalled cycles per insn (16.76%)
|
|
||||||
2353072268027 branches # 647.002 M/sec (16.77%)
|
|
||||||
1998682780 branch-misses # 0.08% of all branches (16.76%)
|
|
||||||
|
|
||||||
247.802177522 seconds time elapsed
|
|
||||||
|
|
||||||
3618.573072000 seconds user
|
|
||||||
18.491698000 seconds sys
|
|
||||||
```
|
|
||||||
|
|
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
|
@ -98,5 +98,5 @@ jobs:
|
||||||
context: .
|
context: .
|
||||||
push: ${{ github.event_name == 'push' }}
|
push: ${{ github.event_name == 'push' }}
|
||||||
platforms: ${{ matrix.config.platforms }}
|
platforms: ${{ matrix.config.platforms }}
|
||||||
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||||
file: ${{ matrix.config.dockerfile }}
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -48,6 +48,7 @@ models-mnt
|
||||||
/llama-bench
|
/llama-bench
|
||||||
/llava-cli
|
/llava-cli
|
||||||
/lookahead
|
/lookahead
|
||||||
|
/lookup
|
||||||
/main
|
/main
|
||||||
/metal
|
/metal
|
||||||
/perplexity
|
/perplexity
|
||||||
|
|
|
@ -302,6 +302,8 @@ if (LLAMA_CUBLAS)
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
|
||||||
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
# 52 == lowest CUDA 12 standard
|
# 52 == lowest CUDA 12 standard
|
||||||
# 60 == f16 CUDA intrinsics
|
# 60 == f16 CUDA intrinsics
|
||||||
|
|
11
Makefile
11
Makefile
|
@ -2,7 +2,7 @@
|
||||||
BUILD_TARGETS = \
|
BUILD_TARGETS = \
|
||||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||||
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
||||||
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
|
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
|
@ -367,17 +367,15 @@ endif # LLAMA_BLIS
|
||||||
|
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
||||||
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
MK_NVCCFLAGS = -use_fast_math
|
MK_NVCCFLAGS = -use_fast_math
|
||||||
ifndef JETSON_EOL_MODULE_DETECT
|
ifndef JETSON_EOL_MODULE_DETECT
|
||||||
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
||||||
endif # JETSON_EOL_MODULE_DETECT
|
endif # JETSON_EOL_MODULE_DETECT
|
||||||
|
|
||||||
ifdef LLAMA_DEBUG
|
ifdef LLAMA_DEBUG
|
||||||
MK_NVCCFLAGS += -lineinfo
|
MK_NVCCFLAGS += -lineinfo
|
||||||
endif
|
endif # LLAMA_DEBUG
|
||||||
|
|
||||||
ifdef LLAMA_CUDA_NVCC
|
ifdef LLAMA_CUDA_NVCC
|
||||||
NVCC = $(LLAMA_CUDA_NVCC)
|
NVCC = $(LLAMA_CUDA_NVCC)
|
||||||
else
|
else
|
||||||
|
@ -664,6 +662,9 @@ parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef LLAMA_METAL
|
||||||
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||||
|
|
|
@ -102,6 +102,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
|
||||||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||||
|
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||||
|
|
||||||
**Multimodal models:**
|
**Multimodal models:**
|
||||||
|
|
||||||
|
@ -132,6 +133,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||||
- [semperai/amica](https://github.com/semperai/amica)
|
- [semperai/amica](https://github.com/semperai/amica)
|
||||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||||
|
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,7 @@ struct gpt_params {
|
||||||
int32_t n_ctx = 512; // context size
|
int32_t n_ctx = 512; // context size
|
||||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
|
||||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||||
int32_t n_sequences = 1; // number of sequences to decode
|
int32_t n_sequences = 1; // number of sequences to decode
|
||||||
|
@ -240,3 +240,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||||
|
|
||||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||||
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||||
|
|
||||||
|
|
|
@ -149,11 +149,12 @@ static void sampler_queue(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_sampling_sample(
|
static llama_token llama_sampling_sample_impl(
|
||||||
struct llama_sampling_context * ctx_sampling,
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
struct llama_context * ctx_cfg,
|
struct llama_context * ctx_cfg,
|
||||||
const int idx) {
|
const int idx,
|
||||||
|
bool is_resampling) { // Add a parameter to indicate if we are resampling
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
const llama_sampling_params & params = ctx_sampling->params;
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||||
|
@ -173,8 +174,17 @@ llama_token llama_sampling_sample(
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
|
||||||
|
// Get a pointer to the logits
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||||
|
|
||||||
|
// Declare original_logits at the beginning of the function scope
|
||||||
|
std::vector<float> original_logits;
|
||||||
|
|
||||||
|
if (!is_resampling) {
|
||||||
|
// Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
|
||||||
|
original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
|
||||||
|
}
|
||||||
|
|
||||||
// apply params.logit_bias map
|
// apply params.logit_bias map
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
|
@ -193,12 +203,14 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply penalties
|
// apply penalties
|
||||||
if (!prev.empty()) {
|
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
||||||
|
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
||||||
|
if (penalty_tokens_used_size) {
|
||||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||||
|
|
||||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||||
prev.data() + prev.size() - penalty_last_n,
|
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
|
||||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
|
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
|
||||||
|
|
||||||
if (!penalize_nl) {
|
if (!penalize_nl) {
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
@ -210,7 +222,8 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_sampling->grammar != NULL) {
|
// If we are in the resampling phase, apply grammar checks before sampling logic
|
||||||
|
if (is_resampling && ctx_sampling->grammar != NULL) {
|
||||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -252,9 +265,40 @@ llama_token llama_sampling_sample(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
||||||
|
// Create an array with a single token data element for the sampled id
|
||||||
|
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
||||||
|
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
||||||
|
|
||||||
|
// Apply grammar constraints to the single token
|
||||||
|
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
|
||||||
|
|
||||||
|
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
||||||
|
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
||||||
|
|
||||||
|
// If the token is not valid according to the grammar, perform resampling
|
||||||
|
if (!is_valid) {
|
||||||
|
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||||
|
|
||||||
|
// Restore logits from the copy
|
||||||
|
std::copy(original_logits.begin(), original_logits.end(), logits);
|
||||||
|
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_sampling_context * ctx_sampling,
|
||||||
|
struct llama_context * ctx_main,
|
||||||
|
struct llama_context * ctx_cfg,
|
||||||
|
const int idx) {
|
||||||
|
// Call the implementation function with is_resampling set to false by default
|
||||||
|
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
|
||||||
|
}
|
||||||
|
|
||||||
void llama_sampling_accept(
|
void llama_sampling_accept(
|
||||||
struct llama_sampling_context * ctx_sampling,
|
struct llama_sampling_context * ctx_sampling,
|
||||||
struct llama_context * ctx_main,
|
struct llama_context * ctx_main,
|
||||||
|
|
|
@ -36,6 +36,9 @@ typedef struct llama_sampling_params {
|
||||||
float cfg_scale = 1.f; // how strong is guidance
|
float cfg_scale = 1.f; // how strong is guidance
|
||||||
|
|
||||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
std::vector<llama_token> penalty_prompt_tokens;
|
||||||
|
bool use_penalty_prompt_tokens = false;
|
||||||
} llama_sampling_params;
|
} llama_sampling_params;
|
||||||
|
|
||||||
// general sampler context
|
// general sampler context
|
||||||
|
|
|
@ -186,6 +186,8 @@ class Model:
|
||||||
return GPT2Model
|
return GPT2Model
|
||||||
if model_architecture == "PhiForCausalLM":
|
if model_architecture == "PhiForCausalLM":
|
||||||
return Phi2Model
|
return Phi2Model
|
||||||
|
if model_architecture == "PlamoForCausalLM":
|
||||||
|
return PlamoModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
|
@ -229,6 +231,8 @@ class Model:
|
||||||
return gguf.MODEL_ARCH.GPT2
|
return gguf.MODEL_ARCH.GPT2
|
||||||
if arch == "PhiForCausalLM":
|
if arch == "PhiForCausalLM":
|
||||||
return gguf.MODEL_ARCH.PHI2
|
return gguf.MODEL_ARCH.PHI2
|
||||||
|
if arch == "PlamoForCausalLM":
|
||||||
|
return gguf.MODEL_ARCH.PLAMO
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
|
@ -1068,11 +1072,91 @@ class Phi2Model(Model):
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
|
||||||
|
|
||||||
|
class PlamoModel(Model):
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
hparams = self.hparams
|
||||||
|
block_count = hparams["num_hidden_layers"]
|
||||||
|
|
||||||
|
self.gguf_writer.add_name("PLaMo")
|
||||||
|
self.gguf_writer.add_context_length(4096) # not in config.json
|
||||||
|
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||||
|
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||||
|
|
||||||
|
def shuffle_attn_q_weight(self, data_torch):
|
||||||
|
assert data_torch.size() == (5120, 5120)
|
||||||
|
data_torch = data_torch.reshape(8, 5, 128, 5120)
|
||||||
|
data_torch = torch.permute(data_torch, (1, 0, 2, 3))
|
||||||
|
data_torch = torch.reshape(data_torch, (5120, 5120))
|
||||||
|
return data_torch
|
||||||
|
|
||||||
|
def shuffle_attn_output_weight(self, data_torch):
|
||||||
|
assert data_torch.size() == (5120, 5120)
|
||||||
|
data_torch = data_torch.reshape(5120, 8, 5, 128)
|
||||||
|
data_torch = torch.permute(data_torch, (0, 2, 1, 3))
|
||||||
|
data_torch = torch.reshape(data_torch, (5120, 5120))
|
||||||
|
return data_torch
|
||||||
|
|
||||||
|
def write_tensors(self):
|
||||||
|
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
|
||||||
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
|
for name, data_torch in self.get_tensors():
|
||||||
|
if "self_attn.rotary_emb.inv_freq" in name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print(f"Can not map tensor {name!r}")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
# shuffle for broadcasting of gqa in ggml_mul_mat
|
||||||
|
if new_name.endswith("attn_q.weight"):
|
||||||
|
data_torch = self.shuffle_attn_q_weight(data_torch)
|
||||||
|
elif new_name.endswith("attn_output.weight"):
|
||||||
|
data_torch = self.shuffle_attn_output_weight(data_torch)
|
||||||
|
|
||||||
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||||
|
data_torch = data_torch.to(torch.float32)
|
||||||
|
|
||||||
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if self.ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
|
|
||||||
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Convert a huggingface model to a GGML compatible file")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--vocab-only", action="store_true",
|
"--vocab-only", action="store_true",
|
||||||
help="extract only the vocab",
|
help="extract only the vocab",
|
||||||
|
|
|
@ -33,6 +33,7 @@ else()
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(lookahead)
|
add_subdirectory(lookahead)
|
||||||
|
add_subdirectory(lookup)
|
||||||
add_subdirectory(train-text-from-scratch)
|
add_subdirectory(train-text-from-scratch)
|
||||||
if (LLAMA_METAL)
|
if (LLAMA_METAL)
|
||||||
add_subdirectory(metal)
|
add_subdirectory(metal)
|
||||||
|
|
5
examples/lookup/CMakeLists.txt
Normal file
5
examples/lookup/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET lookup)
|
||||||
|
add_executable(${TARGET} lookup.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
13
examples/lookup/README.md
Normal file
13
examples/lookup/README.md
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
# llama.cpp/examples/lookup
|
||||||
|
|
||||||
|
Demonstration of Prompt Lookup Decoding
|
||||||
|
|
||||||
|
https://github.com/apoorvumang/prompt-lookup-decoding
|
||||||
|
|
||||||
|
The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
|
||||||
|
|
||||||
|
More info:
|
||||||
|
|
||||||
|
https://github.com/ggerganov/llama.cpp/pull/4484
|
||||||
|
https://github.com/ggerganov/llama.cpp/issues/4226
|
||||||
|
|
230
examples/lookup/lookup.cpp
Normal file
230
examples/lookup/lookup.cpp
Normal file
|
@ -0,0 +1,230 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int main(int argc, char ** argv){
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// max/min n-grams size to search for in prompt
|
||||||
|
const int ngram_max = 4;
|
||||||
|
const int ngram_min = 1;
|
||||||
|
|
||||||
|
// length of the candidate / draft sequence, if match is found
|
||||||
|
const int n_draft = params.n_draft;
|
||||||
|
|
||||||
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
|
#ifndef LOG_DISABLE_LOGS
|
||||||
|
log_set_target(log_filename_generator("lookup", "log"));
|
||||||
|
LOG_TEE("Log start\n");
|
||||||
|
log_dump_cmdline(argc, argv);
|
||||||
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
|
// init llama.cpp
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
llama_model * model = NULL;
|
||||||
|
llama_context * ctx = NULL;
|
||||||
|
|
||||||
|
// load the model
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
// tokenize the prompt
|
||||||
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
LOG("add_bos tgt: %d\n", add_bos);
|
||||||
|
|
||||||
|
std::vector<llama_token> inp;
|
||||||
|
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
|
||||||
|
|
||||||
|
const int max_context_size = llama_n_ctx(ctx);
|
||||||
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
|
||||||
|
if ((int) inp.size() > max_tokens_list_size) {
|
||||||
|
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
for (auto id : inp) {
|
||||||
|
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
fflush(stderr);
|
||||||
|
|
||||||
|
const int n_input = inp.size();
|
||||||
|
|
||||||
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
|
||||||
|
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
|
||||||
|
|
||||||
|
const auto t_enc_end = ggml_time_us();
|
||||||
|
|
||||||
|
int n_predict = 0;
|
||||||
|
int n_drafted = 0;
|
||||||
|
int n_accept = 0;
|
||||||
|
|
||||||
|
int n_past = inp.size();
|
||||||
|
|
||||||
|
bool has_eos = false;
|
||||||
|
|
||||||
|
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||||
|
|
||||||
|
std::vector<llama_token> draft;
|
||||||
|
|
||||||
|
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
|
||||||
|
|
||||||
|
// debug
|
||||||
|
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
|
||||||
|
|
||||||
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// debug
|
||||||
|
if (dump_kv_cache) {
|
||||||
|
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||||
|
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||||
|
}
|
||||||
|
|
||||||
|
// print current draft sequence
|
||||||
|
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
|
||||||
|
|
||||||
|
int i_dft = 0;
|
||||||
|
while (true) {
|
||||||
|
// sample from the target model
|
||||||
|
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
|
||||||
|
|
||||||
|
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||||
|
|
||||||
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
|
if (!params.use_color) {
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (id == llama_token_eos(model)) {
|
||||||
|
has_eos = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
++n_predict;
|
||||||
|
|
||||||
|
// check if the target token matches the draft
|
||||||
|
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
|
||||||
|
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||||
|
++n_accept;
|
||||||
|
++n_past;
|
||||||
|
++i_dft;
|
||||||
|
inp.push_back(id);
|
||||||
|
|
||||||
|
if (params.use_color) {
|
||||||
|
// color accepted draft token
|
||||||
|
printf("\033[34m%s\033[0m", token_str.c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.use_color) {
|
||||||
|
printf("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
|
||||||
|
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||||
|
|
||||||
|
draft.clear();
|
||||||
|
draft.push_back(id);
|
||||||
|
inp.push_back(id);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// KV cache management
|
||||||
|
// clean the cache of draft tokens that weren't accepted
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
|
llama_batch_clear(batch_tgt);
|
||||||
|
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||||
|
|
||||||
|
// generate n_pred tokens through prompt lookup
|
||||||
|
auto prompt_lookup = [&]() -> void {
|
||||||
|
int inp_size = inp.size();
|
||||||
|
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
|
||||||
|
const llama_token * ngram = &inp[inp_size - ngram_size];
|
||||||
|
|
||||||
|
for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
|
||||||
|
bool match = true;
|
||||||
|
for (int j = 0; j < ngram_size; ++j) {
|
||||||
|
if (inp[i + j] != ngram[j]) {
|
||||||
|
match = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (match) {
|
||||||
|
const int startIdx = i + ngram_size;
|
||||||
|
const int endIdx = startIdx + n_draft;
|
||||||
|
if (endIdx < inp_size) {
|
||||||
|
for (int j = startIdx; j < endIdx; ++j) {
|
||||||
|
LOG(" - draft candidate %d: %d\n", j, inp[j]);
|
||||||
|
draft.push_back(inp[j]);
|
||||||
|
llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
|
||||||
|
++n_drafted;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
prompt_lookup();
|
||||||
|
|
||||||
|
llama_decode(ctx, batch_tgt);
|
||||||
|
++n_past;
|
||||||
|
|
||||||
|
draft.erase(draft.begin());
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t_dec_end = ggml_time_us();
|
||||||
|
|
||||||
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
|
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
|
LOG_TEE("\n");
|
||||||
|
LOG_TEE("n_draft = %d\n", n_draft);
|
||||||
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
|
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||||
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
|
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
|
LOG_TEE("\ntarget:\n");
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_sampling_free(ctx_sampling);
|
||||||
|
llama_batch_free(batch_tgt);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -148,6 +148,8 @@ node index.js
|
||||||
|
|
||||||
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
|
`frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);
|
||||||
|
|
||||||
|
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
|
||||||
|
|
||||||
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
||||||
|
|
||||||
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
`mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||||
|
|
|
@ -761,6 +761,42 @@ struct llama_server_context
|
||||||
slot->prompt = "";
|
slot->prompt = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slot->sparams.penalty_prompt_tokens.clear();
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = false;
|
||||||
|
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||||
|
if (penalty_prompt != data.end())
|
||||||
|
{
|
||||||
|
if (penalty_prompt->is_string())
|
||||||
|
{
|
||||||
|
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||||
|
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||||
|
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
||||||
|
if (slot->params.n_predict > 0)
|
||||||
|
{
|
||||||
|
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
|
||||||
|
}
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = true;
|
||||||
|
}
|
||||||
|
else if (penalty_prompt->is_array())
|
||||||
|
{
|
||||||
|
const auto n_tokens = penalty_prompt->size();
|
||||||
|
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
|
||||||
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
for (const auto &penalty_token : *penalty_prompt)
|
||||||
|
{
|
||||||
|
if (penalty_token.is_number_integer())
|
||||||
|
{
|
||||||
|
const auto tok = penalty_token.get<llama_token>();
|
||||||
|
if (tok >= 0 && tok < n_vocab)
|
||||||
|
{
|
||||||
|
slot->sparams.penalty_prompt_tokens.push_back(tok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
slot->sparams.use_penalty_prompt_tokens = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
slot->sparams.logit_bias.clear();
|
slot->sparams.logit_bias.clear();
|
||||||
|
|
||||||
if (json_value(data, "ignore_eos", false))
|
if (json_value(data, "ignore_eos", false))
|
||||||
|
@ -992,6 +1028,12 @@ struct llama_server_context
|
||||||
slot.generated_text += token_str;
|
slot.generated_text += token_str;
|
||||||
slot.has_next_token = true;
|
slot.has_next_token = true;
|
||||||
|
|
||||||
|
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
|
||||||
|
{
|
||||||
|
// we can change penalty_prompt_tokens because it is always created from scratch each request
|
||||||
|
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
|
||||||
|
}
|
||||||
|
|
||||||
// check if there is incomplete UTF-8 character at the end
|
// check if there is incomplete UTF-8 character at the end
|
||||||
bool incomplete = false;
|
bool incomplete = false;
|
||||||
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
|
||||||
|
@ -1183,6 +1225,8 @@ struct llama_server_context
|
||||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||||
{"presence_penalty", slot.sparams.penalty_present},
|
{"presence_penalty", slot.sparams.penalty_present},
|
||||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||||
|
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
|
||||||
|
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
|
||||||
{"mirostat", slot.sparams.mirostat},
|
{"mirostat", slot.sparams.mirostat},
|
||||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||||
|
|
|
@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
|
||||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||||
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
||||||
|
|
||||||
int id = ggml_backend_registry_count;
|
size_t id = ggml_backend_registry_count;
|
||||||
|
|
||||||
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
||||||
/* .name = */ {0},
|
/* .name = */ {0},
|
||||||
|
@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// not found
|
||||||
return SIZE_MAX;
|
return SIZE_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
|
||||||
const char * params = strchr(backend_str, ':');
|
const char * params = strchr(backend_str, ':');
|
||||||
char backend_name[128];
|
char backend_name[128];
|
||||||
if (params == NULL) {
|
if (params == NULL) {
|
||||||
strcpy(backend_name, backend_str);
|
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
|
||||||
params = "";
|
params = "";
|
||||||
} else {
|
} else {
|
||||||
strncpy(backend_name, backend_str, params - backend_str);
|
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
|
||||||
backend_name[params - backend_str] = '\0';
|
|
||||||
params++;
|
params++;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
||||||
|
|
||||||
if (backend_i == SIZE_MAX) {
|
if (backend_i == SIZE_MAX) {
|
||||||
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
|
|
517
ggml-cuda.cu
517
ggml-cuda.cu
|
@ -86,17 +86,29 @@
|
||||||
#define cudaStream_t hipStream_t
|
#define cudaStream_t hipStream_t
|
||||||
#define cudaSuccess hipSuccess
|
#define cudaSuccess hipSuccess
|
||||||
#define __trap abort
|
#define __trap abort
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
|
||||||
|
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
|
||||||
|
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
|
||||||
|
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
|
||||||
|
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
|
||||||
|
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
|
||||||
|
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||||
|
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||||
#else
|
#else
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <cuda.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
// CUDA 10.2 does not have these macro definitions.
|
|
||||||
#ifndef CUBLAS_TF32_TENSOR_OP_MATH
|
#if CUDART_VERSION < 11020
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||||
#define cublasComputeType_t cudaDataType_t
|
#define cublasComputeType_t cudaDataType_t
|
||||||
#endif
|
#endif // CUDART_VERSION < 11020
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS)
|
#endif // defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
|
@ -200,45 +212,45 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||||
|
|
||||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
|
|
||||||
#define CUDA_CHECK(err) \
|
|
||||||
do { \
|
|
||||||
cudaError_t err_ = (err); \
|
|
||||||
if (err_ != cudaSuccess) { \
|
|
||||||
int id; \
|
|
||||||
cudaGetDevice(&id); \
|
|
||||||
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
|
||||||
cudaGetErrorString(err_)); \
|
|
||||||
fprintf(stderr, "current device: %d\n", id); \
|
|
||||||
GGML_ASSERT(!"CUDA error"); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#if CUDART_VERSION >= 12000
|
#if CUDART_VERSION >= 12000
|
||||||
#define CUBLAS_CHECK(err) \
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
do { \
|
return cublasGetStatusString(err);
|
||||||
cublasStatus_t err_ = (err); \
|
}
|
||||||
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
|
||||||
int id; \
|
|
||||||
cudaGetDevice(&id); \
|
|
||||||
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
|
||||||
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
|
||||||
fprintf(stderr, "current device: %d\n", id); \
|
|
||||||
GGML_ASSERT(!"cuBLAS error"); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#else
|
#else
|
||||||
#define CUBLAS_CHECK(err) \
|
static const char * cublas_get_error_str(const cublasStatus_t err) {
|
||||||
do { \
|
switch (err) {
|
||||||
cublasStatus_t err_ = (err); \
|
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
|
||||||
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
|
||||||
int id; \
|
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
|
||||||
cudaGetDevice(&id); \
|
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
|
||||||
fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
|
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
|
||||||
fprintf(stderr, "current device: %d\n", id); \
|
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
|
||||||
GGML_ASSERT(!"cuBLAS error"); \
|
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
|
||||||
} \
|
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
|
||||||
} while (0)
|
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
|
||||||
#endif // CUDART_VERSION >= 11
|
default: return "unknown error";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // CUDART_VERSION >= 12000
|
||||||
|
|
||||||
|
[[noreturn]]
|
||||||
|
static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
|
||||||
|
fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
|
||||||
|
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
|
||||||
|
GGML_ASSERT(!"CUDA error");
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CUDA_CHECK(err) do { auto err_ = (err); if (err_ != cudaSuccess) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cudaGetErrorString(err_)); } while (0)
|
||||||
|
#define CUBLAS_CHECK(err) do { auto err_ = (err); if (err_ != CUBLAS_STATUS_SUCCESS) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cublas_get_error_str(err_)); } while (0)
|
||||||
|
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
|
static const char * cu_get_error_str(CUresult err) {
|
||||||
|
const char * err_str;
|
||||||
|
cuGetErrorString(err, &err_str);
|
||||||
|
return err_str;
|
||||||
|
}
|
||||||
|
#define CU_CHECK(err) do { auto err_ = (err); if (err_ != CUDA_SUCCESS) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cu_get_error_str(err_)); } while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
#if CUDART_VERSION >= 11100
|
#if CUDART_VERSION >= 11100
|
||||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||||
|
@ -516,9 +528,17 @@ inline cudaError_t ggml_cuda_set_device(const int device) {
|
||||||
|
|
||||||
static int g_device_count = -1;
|
static int g_device_count = -1;
|
||||||
static int g_main_device = 0;
|
static int g_main_device = 0;
|
||||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
|
||||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
|
||||||
|
struct cuda_device_capabilities {
|
||||||
|
int cc; // compute capability
|
||||||
|
bool vmm; // virtual memory support
|
||||||
|
size_t vmm_granularity; // granularity of virtual memory
|
||||||
|
};
|
||||||
|
|
||||||
|
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
|
||||||
|
|
||||||
|
|
||||||
static void * g_scratch_buffer = nullptr;
|
static void * g_scratch_buffer = nullptr;
|
||||||
static size_t g_scratch_size = 0; // disabled by default
|
static size_t g_scratch_size = 0; // disabled by default
|
||||||
static size_t g_scratch_offset = 0;
|
static size_t g_scratch_offset = 0;
|
||||||
|
@ -5875,7 +5895,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -5920,7 +5940,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -5965,7 +5985,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6010,7 +6030,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6055,7 +6075,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6100,7 +6120,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6147,7 +6167,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6193,7 +6213,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6238,7 +6258,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6283,7 +6303,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
if (compute_capability >= CC_RDNA2) {
|
if (compute_capability >= CC_RDNA2) {
|
||||||
|
@ -6543,21 +6563,24 @@ struct scoped_spin_lock {
|
||||||
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
||||||
|
|
||||||
|
// #define DEBUG_CUDA_MALLOC
|
||||||
struct cuda_buffer {
|
struct cuda_buffer {
|
||||||
void * ptr = nullptr;
|
void * ptr = nullptr;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
|
static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
|
||||||
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
|
static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
|
||||||
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
int nnz = 0;
|
int nnz = 0;
|
||||||
size_t max_size = 0, tot_size = 0;
|
size_t max_size = 0;
|
||||||
#endif
|
#endif
|
||||||
size_t best_diff = 1ull << 36;
|
size_t best_diff = 1ull << 36;
|
||||||
int ibest = -1;
|
int ibest = -1;
|
||||||
|
@ -6566,7 +6589,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
if (b.ptr != nullptr) {
|
if (b.ptr != nullptr) {
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
++nnz;
|
++nnz;
|
||||||
tot_size += b.size;
|
|
||||||
if (b.size > max_size) max_size = b.size;
|
if (b.size > max_size) max_size = b.size;
|
||||||
#endif
|
#endif
|
||||||
if (b.size >= size) {
|
if (b.size >= size) {
|
||||||
|
@ -6593,19 +6615,20 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
b.size = 0;
|
b.size = 0;
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_CUDA_MALLOC
|
|
||||||
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
|
||||||
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
|
||||||
#endif
|
|
||||||
void * ptr;
|
void * ptr;
|
||||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||||
*actual_size = look_ahead_size;
|
*actual_size = look_ahead_size;
|
||||||
|
g_cuda_pool_size[id] += look_ahead_size;
|
||||||
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
|
fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
|
||||||
|
(uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
|
||||||
|
#endif
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
@ -6620,8 +6643,152 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||||
}
|
}
|
||||||
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||||
CUDA_CHECK(cudaFree(ptr));
|
CUDA_CHECK(cudaFree(ptr));
|
||||||
|
g_cuda_pool_size[id] -= size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
|
// pool with virtual memory
|
||||||
|
static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
|
||||||
|
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||||
|
|
||||||
|
static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
|
||||||
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
|
// round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
|
||||||
|
const size_t alignment = 128;
|
||||||
|
size = alignment * ((size + alignment - 1) / alignment);
|
||||||
|
|
||||||
|
size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
|
||||||
|
|
||||||
|
if (size > avail) {
|
||||||
|
// round up to the next multiple of the granularity
|
||||||
|
size_t reserve_size = size - avail;
|
||||||
|
const size_t granularity = g_device_caps[id].vmm_granularity;
|
||||||
|
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
|
||||||
|
|
||||||
|
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
|
||||||
|
|
||||||
|
// allocate more physical memory
|
||||||
|
CUmemAllocationProp prop = {};
|
||||||
|
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||||
|
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
prop.location.id = id;
|
||||||
|
CUmemGenericAllocationHandle handle;
|
||||||
|
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
|
||||||
|
|
||||||
|
// reserve virtual address space (if not already reserved)
|
||||||
|
if (g_cuda_pool_addr[id] == 0) {
|
||||||
|
CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// map at the end of the pool
|
||||||
|
CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
|
||||||
|
|
||||||
|
// set access
|
||||||
|
CUmemAccessDesc access = {};
|
||||||
|
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
access.location.id = id;
|
||||||
|
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
||||||
|
CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
|
||||||
|
|
||||||
|
// add to the pool
|
||||||
|
g_cuda_pool_handles[id].push_back(handle);
|
||||||
|
g_cuda_pool_size[id] += reserve_size;
|
||||||
|
|
||||||
|
//printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||||
|
// id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
|
||||||
|
// (unsigned long long) (reserve_size/1024/1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(g_cuda_pool_addr[id] != 0);
|
||||||
|
|
||||||
|
void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
|
||||||
|
*actual_size = size;
|
||||||
|
g_cuda_pool_used[id] += size;
|
||||||
|
|
||||||
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
|
printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
|
||||||
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
|
#ifdef DEBUG_CUDA_MALLOC
|
||||||
|
printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
g_cuda_pool_used[id] -= size;
|
||||||
|
|
||||||
|
// all deallocations must be in reverse order of the allocations
|
||||||
|
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
if (g_device_caps[id].vmm) {
|
||||||
|
return ggml_cuda_pool_malloc_vmm(size, actual_size);
|
||||||
|
} else {
|
||||||
|
return ggml_cuda_pool_malloc_leg(size, actual_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||||
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
if (g_device_caps[id].vmm) {
|
||||||
|
ggml_cuda_pool_free_vmm(ptr, size);
|
||||||
|
} else {
|
||||||
|
ggml_cuda_pool_free_leg(ptr, size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
|
||||||
|
#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
|
||||||
|
#endif // !defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
struct cuda_pool_alloc {
|
||||||
|
T * ptr = nullptr;
|
||||||
|
size_t actual_size = 0;
|
||||||
|
|
||||||
|
// size is in number of elements
|
||||||
|
T * alloc(size_t size) {
|
||||||
|
GGML_ASSERT(ptr == nullptr);
|
||||||
|
ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
cuda_pool_alloc(size_t size) {
|
||||||
|
alloc(size);
|
||||||
|
}
|
||||||
|
|
||||||
|
~cuda_pool_alloc() {
|
||||||
|
if (ptr != nullptr) {
|
||||||
|
ggml_cuda_pool_free(ptr, actual_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
T * get() {
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
cuda_pool_alloc() = default;
|
||||||
|
cuda_pool_alloc(const cuda_pool_alloc &) = delete;
|
||||||
|
cuda_pool_alloc(cuda_pool_alloc &&) = delete;
|
||||||
|
cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
|
||||||
|
cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
static bool g_cublas_loaded = false;
|
static bool g_cublas_loaded = false;
|
||||||
|
|
||||||
bool ggml_cublas_loaded(void) {
|
bool ggml_cublas_loaded(void) {
|
||||||
|
@ -6660,16 +6827,33 @@ void ggml_init_cublas() {
|
||||||
#endif
|
#endif
|
||||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
|
int device_vmm = 0;
|
||||||
|
|
||||||
|
#if !defined(GGML_USE_HIPBLAS)
|
||||||
|
CUdevice device;
|
||||||
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
|
||||||
|
if (device_vmm) {
|
||||||
|
CUmemAllocationProp alloc_prop = {};
|
||||||
|
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
||||||
|
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
alloc_prop.location.id = id;
|
||||||
|
CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
|
||||||
|
}
|
||||||
|
#endif // !defined(GGML_USE_HIPBLAS)
|
||||||
|
g_device_caps[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||||
|
|
||||||
g_tensor_split[id] = total_vram;
|
g_tensor_split[id] = total_vram;
|
||||||
total_vram += prop.totalGlobalMem;
|
total_vram += prop.totalGlobalMem;
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||||
#else
|
#else
|
||||||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
}
|
}
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
|
@ -6729,8 +6913,7 @@ void * ggml_cuda_host_malloc(size_t size) {
|
||||||
void * ptr = nullptr;
|
void * ptr = nullptr;
|
||||||
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
cudaError_t err = cudaMallocHost((void **) &ptr, size);
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
// The allocation error can be bypassed. A null ptr will assigned out of this function.
|
// clear the error
|
||||||
// This can fixed the OOM error in WSL.
|
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
|
||||||
size/1024.0/1024.0, cudaGetErrorString(err));
|
size/1024.0/1024.0, cudaGetErrorString(err));
|
||||||
|
@ -7179,11 +7362,11 @@ static int64_t get_row_rounding(ggml_type type) {
|
||||||
int64_t max_compute_capability = INT_MIN;
|
int64_t max_compute_capability = INT_MIN;
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||||
if (min_compute_capability > g_compute_capabilities[id]) {
|
if (min_compute_capability > g_device_caps[id].cc) {
|
||||||
min_compute_capability = g_compute_capabilities[id];
|
min_compute_capability = g_device_caps[id].cc;
|
||||||
}
|
}
|
||||||
if (max_compute_capability < g_compute_capabilities[id]) {
|
if (max_compute_capability < g_device_caps[id].cc) {
|
||||||
max_compute_capability = g_compute_capabilities[id];
|
max_compute_capability = g_device_caps[id].cc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7298,8 +7481,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||||
|
|
||||||
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
||||||
#ifdef GGML_CUDA_F16
|
#ifdef GGML_CUDA_F16
|
||||||
size_t ash;
|
cuda_pool_alloc<half> src1_dfloat_a;
|
||||||
dfloat * src1_dfloat = nullptr; // dfloat == half
|
half * src1_dfloat = nullptr; // dfloat == half
|
||||||
|
|
||||||
bool src1_convert_f16 =
|
bool src1_convert_f16 =
|
||||||
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
||||||
|
@ -7307,7 +7490,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
||||||
|
|
||||||
if (src1_convert_f16) {
|
if (src1_convert_f16) {
|
||||||
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
||||||
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
||||||
ne00, 1, sizeof(float), 0, 0,
|
ne00, 1, sizeof(float), 0, 0,
|
||||||
ne00, 1, sizeof(half), 0, 0, stream);
|
ne00, 1, sizeof(half), 0, 0, stream);
|
||||||
|
@ -7355,12 +7538,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_CUDA_F16
|
|
||||||
if (src1_convert_f16) {
|
|
||||||
ggml_cuda_pool_free(src1_dfloat, ash);
|
|
||||||
}
|
|
||||||
#endif // GGML_CUDA_F16
|
|
||||||
|
|
||||||
(void) src1;
|
(void) src1;
|
||||||
(void) dst;
|
(void) dst;
|
||||||
(void) src1_ddq_i;
|
(void) src1_ddq_i;
|
||||||
|
@ -7391,33 +7568,30 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
// ldc == nrows of the matrix that cuBLAS writes into
|
// ldc == nrows of the matrix that cuBLAS writes into
|
||||||
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
|
||||||
|
|
||||||
const int compute_capability = g_compute_capabilities[id];
|
const int compute_capability = g_device_caps[id].cc;
|
||||||
|
|
||||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||||
half * src0_as_f16 = nullptr;
|
cuda_pool_alloc<half> src0_as_f16;
|
||||||
size_t src0_as = 0;
|
|
||||||
if (src0->type != GGML_TYPE_F16) {
|
if (src0->type != GGML_TYPE_F16) {
|
||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
size_t ne = row_diff*ne00;
|
size_t ne = row_diff*ne00;
|
||||||
src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
|
src0_as_f16.alloc(ne);
|
||||||
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
|
to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
|
||||||
}
|
}
|
||||||
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
||||||
|
|
||||||
half * src1_as_f16 = nullptr;
|
cuda_pool_alloc<half> src1_as_f16;
|
||||||
size_t src1_as = 0;
|
|
||||||
if (src1->type != GGML_TYPE_F16) {
|
if (src1->type != GGML_TYPE_F16) {
|
||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
size_t ne = src1_ncols*ne10;
|
size_t ne = src1_ncols*ne10;
|
||||||
src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
|
src1_as_f16.alloc(ne);
|
||||||
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
|
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
||||||
}
|
}
|
||||||
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
||||||
size_t dst_as = 0;
|
cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
|
||||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
|
|
||||||
|
|
||||||
const half alpha_f16 = 1.0f;
|
const half alpha_f16 = 1.0f;
|
||||||
const half beta_f16 = 0.0f;
|
const half beta_f16 = 0.0f;
|
||||||
|
@ -7426,36 +7600,25 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
row_diff, src1_ncols, ne10,
|
row_diff, src1_ncols, ne10,
|
||||||
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
||||||
src1_ptr, CUDA_R_16F, ne10,
|
src1_ptr, CUDA_R_16F, ne10,
|
||||||
&beta_f16, dst_f16, CUDA_R_16F, ldc,
|
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
||||||
CUBLAS_COMPUTE_16F,
|
CUBLAS_COMPUTE_16F,
|
||||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||||
|
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||||
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
|
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
|
||||||
|
|
||||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
|
||||||
|
|
||||||
if (src0_as != 0) {
|
|
||||||
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (src1_as != 0) {
|
|
||||||
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
float * src0_ddq_as_f32 = nullptr;
|
cuda_pool_alloc<float> src0_ddq_as_f32;
|
||||||
size_t src0_as = 0;
|
|
||||||
|
|
||||||
if (src0->type != GGML_TYPE_F32) {
|
if (src0->type != GGML_TYPE_F32) {
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
||||||
GGML_ASSERT(to_fp32_cuda != nullptr);
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
||||||
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
src0_ddq_as_f32.alloc(row_diff*ne00);
|
||||||
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
|
||||||
}
|
}
|
||||||
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
|
||||||
|
|
||||||
const float alpha = 1.0f;
|
const float alpha = 1.0f;
|
||||||
const float beta = 0.0f;
|
const float beta = 0.0f;
|
||||||
|
@ -7467,10 +7630,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||||
&alpha, src0_ddf_i, ne00,
|
&alpha, src0_ddf_i, ne00,
|
||||||
src1_ddf_i, ne10,
|
src1_ddf_i, ne10,
|
||||||
&beta, dst_dd_i, ldc));
|
&beta, dst_dd_i, ldc));
|
||||||
|
|
||||||
if (src0_as != 0) {
|
|
||||||
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) dst;
|
(void) dst;
|
||||||
|
@ -7762,18 +7921,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
float * src1_ddf = nullptr;
|
float * src1_ddf = nullptr;
|
||||||
float * dst_ddf = nullptr;
|
float * dst_ddf = nullptr;
|
||||||
|
|
||||||
// as = actual size
|
cuda_pool_alloc<float> src0_f;
|
||||||
size_t src0_asf = 0;
|
cuda_pool_alloc<float> src1_f;
|
||||||
size_t src1_asf = 0;
|
cuda_pool_alloc<float> dst_f;
|
||||||
size_t dst_asf = 0;
|
|
||||||
|
|
||||||
ggml_cuda_set_device(g_main_device);
|
ggml_cuda_set_device(g_main_device);
|
||||||
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||||
|
|
||||||
if (src0_on_device) {
|
if (src0_on_device) {
|
||||||
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
src0_ddf = (float *) src0_extra->data_device[g_main_device];
|
||||||
} else {
|
} else {
|
||||||
src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
|
src0_ddf = src0_f.alloc(ggml_nelements(src0));
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7781,14 +7939,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
if (src1_on_device) {
|
if (src1_on_device) {
|
||||||
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||||
} else {
|
} else {
|
||||||
src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
|
src1_ddf = src1_f.alloc(ggml_nelements(src1));
|
||||||
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dst_on_device) {
|
if (dst_on_device) {
|
||||||
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||||
} else {
|
} else {
|
||||||
dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
|
dst_ddf = dst_f.alloc(ggml_nelements(dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
// do the computation
|
// do the computation
|
||||||
|
@ -7800,16 +7958,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src0_asf > 0) {
|
|
||||||
ggml_cuda_pool_free(src0_ddf, src0_asf);
|
|
||||||
}
|
|
||||||
if (src1_asf > 0) {
|
|
||||||
ggml_cuda_pool_free(src1_ddf, src1_asf);
|
|
||||||
}
|
|
||||||
if (dst_asf > 0) {
|
|
||||||
ggml_cuda_pool_free(dst_ddf, dst_asf);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_CPU) {
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
@ -7937,12 +8085,16 @@ static void ggml_cuda_op_mul_mat(
|
||||||
|
|
||||||
if (id != 0) {
|
if (id != 0) {
|
||||||
row_low[id] = ne01*g_tensor_split[id];
|
row_low[id] = ne01*g_tensor_split[id];
|
||||||
row_low[id] -= row_low[id] % rounding;
|
if (row_low[id] < ne01) {
|
||||||
|
row_low[id] -= row_low[id] % rounding;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (id != g_device_count - 1) {
|
if (id != g_device_count - 1) {
|
||||||
row_high[id] = ne01*g_tensor_split[id + 1];
|
row_high[id] = ne01*g_tensor_split[id + 1];
|
||||||
row_high[id] -= row_high[id] % rounding;
|
if (row_high[id] < ne01) {
|
||||||
|
row_high[id] -= row_high[id] % rounding;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8119,17 +8271,17 @@ static void ggml_cuda_op_mul_mat(
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||||
|
|
||||||
// free buffers again when done
|
// free buffers again when done
|
||||||
if (src0_as[id] > 0) {
|
if (dst_as[id] > 0) {
|
||||||
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
||||||
}
|
|
||||||
if (src1_asf[id] > 0) {
|
|
||||||
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
|
||||||
}
|
}
|
||||||
if (src1_asq[id] > 0) {
|
if (src1_asq[id] > 0) {
|
||||||
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
||||||
}
|
}
|
||||||
if (dst_as[id] > 0) {
|
if (src1_asf[id] > 0) {
|
||||||
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
||||||
|
}
|
||||||
|
if (src0_as[id] > 0) {
|
||||||
|
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8382,14 +8534,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
|
|
||||||
size_t src1_as = 0;
|
cuda_pool_alloc<half> src1_as_f16(ne1);
|
||||||
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
|
||||||
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
|
||||||
|
|
||||||
size_t dst_as = 0;
|
cuda_pool_alloc<half> dst_f16;
|
||||||
|
char * dst_t;
|
||||||
half * dst_f16 = nullptr;
|
|
||||||
char * dst_t = nullptr;
|
|
||||||
|
|
||||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||||
|
@ -8408,8 +8557,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
const void * beta = &beta_f16;
|
const void * beta = &beta_f16;
|
||||||
|
|
||||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||||
dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
dst_t = (char *) dst_f16.alloc(ne);
|
||||||
dst_t = (char *) dst_f16;
|
|
||||||
|
|
||||||
nbd2 /= sizeof(float) / sizeof(half);
|
nbd2 /= sizeof(float) / sizeof(half);
|
||||||
nbd3 /= sizeof(float) / sizeof(half);
|
nbd3 /= sizeof(float) / sizeof(half);
|
||||||
|
@ -8456,9 +8604,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||||
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
(const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||||
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||||
ne12*ne13,
|
ne12*ne13,
|
||||||
cu_compute_type,
|
cu_compute_type,
|
||||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||||
|
@ -8466,19 +8614,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
// use cublasGemmBatchedEx
|
// use cublasGemmBatchedEx
|
||||||
const int ne23 = ne12*ne13;
|
const int ne23 = ne12*ne13;
|
||||||
|
|
||||||
const void ** ptrs_src = nullptr;
|
cuda_pool_alloc<const void *> ptrs_src(2*ne23);
|
||||||
void ** ptrs_dst = nullptr;
|
cuda_pool_alloc< void *> ptrs_dst(1*ne23);
|
||||||
|
|
||||||
size_t ptrs_src_s = 0;
|
|
||||||
size_t ptrs_dst_s = 0;
|
|
||||||
|
|
||||||
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
|
||||||
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
|
||||||
|
|
||||||
dim3 block_dims(ne13, ne12);
|
dim3 block_dims(ne13, ne12);
|
||||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||||
src0_as_f16, src1_as_f16, dst_t,
|
src0_as_f16, src1_as_f16.get(), dst_t,
|
||||||
ptrs_src, ptrs_dst,
|
ptrs_src.get(), ptrs_dst.get(),
|
||||||
ne12, ne13,
|
ne12, ne13,
|
||||||
ne23,
|
ne23,
|
||||||
nb02, nb03,
|
nb02, nb03,
|
||||||
|
@ -8490,30 +8632,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||||
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||||
beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
|
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
||||||
ne23,
|
ne23,
|
||||||
cu_compute_type,
|
cu_compute_type,
|
||||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||||
|
|
||||||
if (ptrs_src_s != 0) {
|
|
||||||
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
|
||||||
}
|
|
||||||
if (ptrs_dst_s != 0) {
|
|
||||||
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
|
||||||
|
|
||||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
@ -8526,8 +8657,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||||
min_compute_capability = g_compute_capabilities[id];
|
min_compute_capability = g_device_caps[id].cc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8840,12 +8971,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
size_t as_src1, as_dst;
|
cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
|
||||||
char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
|
cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
|
||||||
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
|
|
||||||
|
|
||||||
src1_row_extra.data_device[g_main_device] = src1_contiguous;
|
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
|
||||||
dst_row_extra.data_device[g_main_device] = dst_contiguous;
|
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
|
||||||
|
|
||||||
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
||||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||||
|
@ -8865,7 +8995,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
|
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
|
||||||
nb11, src1_kind, stream));
|
nb11, src1_kind, stream));
|
||||||
num_src1_rows++;
|
num_src1_rows++;
|
||||||
}
|
}
|
||||||
|
@ -8897,14 +9027,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||||
|
|
||||||
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
GGML_ASSERT(row_id >= 0 && row_id < n_as);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
|
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
|
||||||
nb1, dst_kind, stream));
|
nb1, dst_kind, stream));
|
||||||
num_src1_rows++;
|
num_src1_rows++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cuda_pool_free(src1_contiguous, as_src1);
|
|
||||||
ggml_cuda_pool_free(dst_contiguous, as_dst);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_CPU) {
|
if (dst->backend == GGML_BACKEND_CPU) {
|
||||||
|
@ -9670,12 +9797,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||||
// host buffer type
|
// host buffer type
|
||||||
|
|
||||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
CUDA_CHECK(cudaFreeHost(buffer->context));
|
ggml_cuda_host_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
void * ptr;
|
void * ptr = ggml_cuda_host_malloc(size);
|
||||||
CUDA_CHECK(cudaMallocHost(&ptr, size));
|
|
||||||
|
if (ptr == nullptr) {
|
||||||
|
// fallback to cpu buffer
|
||||||
|
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||||
|
}
|
||||||
|
|
||||||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
|
6
ggml.c
6
ggml.c
|
@ -17456,9 +17456,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// ADAM
|
// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||||
//
|
//
|
||||||
// ref: https://arxiv.org/pdf/1412.6980.pdf
|
// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
|
||||||
//
|
//
|
||||||
|
|
||||||
static enum ggml_opt_result ggml_opt_adam(
|
static enum ggml_opt_result ggml_opt_adam(
|
||||||
|
@ -19351,7 +19351,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
||||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||||
}
|
}
|
||||||
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||||
free(data);
|
free((void *)data);
|
||||||
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||||
GGML_ASSERT(false && "nested arrays not supported");
|
GGML_ASSERT(false && "nested arrays not supported");
|
||||||
} else {
|
} else {
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -255,6 +255,8 @@
|
||||||
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
||||||
#elif defined(__GNUC__)
|
#elif defined(__GNUC__)
|
||||||
#define GGML_UNREACHABLE() __builtin_unreachable()
|
#define GGML_UNREACHABLE() __builtin_unreachable()
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define GGML_UNREACHABLE() __assume(0)
|
||||||
#else
|
#else
|
||||||
#define GGML_UNREACHABLE() ((void) 0)
|
#define GGML_UNREACHABLE() ((void) 0)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -96,6 +96,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
STABLELM = auto()
|
STABLELM = auto()
|
||||||
QWEN = auto()
|
QWEN = auto()
|
||||||
PHI2 = auto()
|
PHI2 = auto()
|
||||||
|
PLAMO = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
|
@ -142,6 +143,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
MODEL_ARCH.QWEN: "qwen",
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
MODEL_ARCH.PHI2: "phi2",
|
MODEL_ARCH.PHI2: "phi2",
|
||||||
|
MODEL_ARCH.PLAMO: "plamo",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
|
@ -349,6 +351,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.PLAMO: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.GPT2: [
|
MODEL_ARCH.GPT2: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.POS_EMBD,
|
MODEL_TENSOR.POS_EMBD,
|
||||||
|
|
|
@ -82,6 +82,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.ln1", # yi
|
"model.layers.{bid}.ln1", # yi
|
||||||
"h.{bid}.ln_1", # gpt2
|
"h.{bid}.ln_1", # gpt2
|
||||||
"transformer.h.{bid}.ln", # phi2
|
"transformer.h.{bid}.ln", # phi2
|
||||||
|
"model.layers.layers.{bid}.norm", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention norm 2
|
# Attention norm 2
|
||||||
|
@ -103,26 +104,29 @@ class TensorNameMap:
|
||||||
|
|
||||||
# Attention query
|
# Attention query
|
||||||
MODEL_TENSOR.ATTN_Q: (
|
MODEL_TENSOR.ATTN_Q: (
|
||||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wq", # llama-pth
|
"layers.{bid}.attention.wq", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.query", # bert
|
"encoder.layer.{bid}.attention.self.query", # bert
|
||||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||||
|
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention key
|
# Attention key
|
||||||
MODEL_TENSOR.ATTN_K: (
|
MODEL_TENSOR.ATTN_K: (
|
||||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wk", # llama-pth
|
"layers.{bid}.attention.wk", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.key", # bert
|
"encoder.layer.{bid}.attention.self.key", # bert
|
||||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||||
|
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention value
|
# Attention value
|
||||||
MODEL_TENSOR.ATTN_V: (
|
MODEL_TENSOR.ATTN_V: (
|
||||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wv", # llama-pth
|
"layers.{bid}.attention.wv", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.self.value", # bert
|
"encoder.layer.{bid}.attention.self.value", # bert
|
||||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||||
|
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output
|
# Attention output
|
||||||
|
@ -139,12 +143,14 @@ class TensorNameMap:
|
||||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||||
"h.{bid}.attn.c_proj", # gpt2
|
"h.{bid}.attn.c_proj", # gpt2
|
||||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||||
|
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Rotary embeddings
|
# Rotary embeddings
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||||
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||||
|
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
# Feed-forward norm
|
# Feed-forward norm
|
||||||
|
@ -181,6 +187,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mlp.w1", # qwen
|
"transformer.h.{bid}.mlp.w1", # qwen
|
||||||
"h.{bid}.mlp.c_fc", # gpt2
|
"h.{bid}.mlp.c_fc", # gpt2
|
||||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||||
|
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
@ -193,6 +200,7 @@ class TensorNameMap:
|
||||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||||
"transformer.h.{bid}.mlp.w2", # qwen
|
"transformer.h.{bid}.mlp.w2", # qwen
|
||||||
|
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
|
@ -214,6 +222,7 @@ class TensorNameMap:
|
||||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||||
"h.{bid}.mlp.c_proj", # gpt2
|
"h.{bid}.mlp.c_proj", # gpt2
|
||||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||||
|
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
|
203
llama.cpp
203
llama.cpp
|
@ -198,6 +198,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
LLM_ARCH_QWEN,
|
LLM_ARCH_QWEN,
|
||||||
LLM_ARCH_PHI2,
|
LLM_ARCH_PHI2,
|
||||||
|
LLM_ARCH_PLAMO,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -216,6 +217,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
{ LLM_ARCH_QWEN, "qwen" },
|
{ LLM_ARCH_QWEN, "qwen" },
|
||||||
{ LLM_ARCH_PHI2, "phi2" },
|
{ LLM_ARCH_PHI2, "phi2" },
|
||||||
|
{ LLM_ARCH_PLAMO, "plamo" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
|
@ -576,6 +578,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_PLAMO,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
|
@ -1186,21 +1206,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
||||||
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (n_gpu_layers > 0) {
|
if (n_gpu_layers > 0) {
|
||||||
return ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
||||||
if (n_gpu_layers > 0) {
|
if (n_gpu_layers > 0) {
|
||||||
return ggml_backend_cuda_buffer_type(0);
|
buft = ggml_backend_cuda_buffer_type(0);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUBLAS)
|
||||||
return ggml_backend_cuda_host_buffer_type();
|
buft = ggml_backend_cuda_host_buffer_type();
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
return ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return ggml_backend_cpu_buffer_type();
|
if (buft == nullptr) {
|
||||||
|
buft = ggml_backend_cpu_buffer_type();
|
||||||
|
}
|
||||||
|
|
||||||
|
return buft;
|
||||||
|
|
||||||
GGML_UNUSED(n_gpu_layers);
|
GGML_UNUSED(n_gpu_layers);
|
||||||
}
|
}
|
||||||
|
@ -1288,7 +1314,7 @@ struct llama_hparams {
|
||||||
if (this->rope_finetuned != other.rope_finetuned) return true;
|
if (this->rope_finetuned != other.rope_finetuned) return true;
|
||||||
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
||||||
|
|
||||||
const float EPSILON = 1e-9;
|
const float EPSILON = 1e-9f;
|
||||||
|
|
||||||
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
|
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
|
||||||
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
|
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
|
||||||
|
@ -2760,6 +2786,15 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_PLAMO:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 40: model.type = e_model::MODEL_13B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_GPT2:
|
case LLM_ARCH_GPT2:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
@ -3652,6 +3687,51 @@ static bool llm_load_tensors(
|
||||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_PLAMO:
|
||||||
|
{
|
||||||
|
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||||
|
|
||||||
|
// output
|
||||||
|
{
|
||||||
|
ggml_backend_type backend_norm;
|
||||||
|
ggml_backend_type backend_output;
|
||||||
|
|
||||||
|
if (n_gpu_layers > int(n_layer)) {
|
||||||
|
backend_norm = llama_backend_offload;
|
||||||
|
backend_output = llama_backend_offload_split;
|
||||||
|
} else {
|
||||||
|
backend_norm = GGML_BACKEND_CPU;
|
||||||
|
backend_output = GGML_BACKEND_CPU;
|
||||||
|
}
|
||||||
|
|
||||||
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||||
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
|
|
||||||
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
|
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
||||||
|
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
||||||
|
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
||||||
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
|
||||||
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||||
|
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_GPT2:
|
case LLM_ARCH_GPT2:
|
||||||
{
|
{
|
||||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||||
|
@ -5650,6 +5730,109 @@ struct llm_build_context {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_plamo() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||||
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||||
|
cb(inp_pos, "inp_pos", -1);
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||||
|
cb(KQ_mask, "KQ_mask", -1);
|
||||||
|
|
||||||
|
// shift the entire K-cache if needed
|
||||||
|
if (do_rope_shift) {
|
||||||
|
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
|
||||||
|
// norm
|
||||||
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||||
|
model.layers[il].attn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "attn_norm", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * attention_norm = cur;
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
Qcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
|
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||||
|
|
||||||
|
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||||
|
model.layers[il].wo, NULL,
|
||||||
|
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
|
cb(cur, "kqv_out", il);
|
||||||
|
}
|
||||||
|
struct ggml_tensor * sa_out = cur;
|
||||||
|
|
||||||
|
cur = attention_norm;
|
||||||
|
|
||||||
|
// feed-forward network
|
||||||
|
{
|
||||||
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
|
model.layers[il].ffn_up, NULL,
|
||||||
|
model.layers[il].ffn_gate, NULL,
|
||||||
|
model.layers[il].ffn_down, NULL,
|
||||||
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, sa_out);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, inpL);
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
|
model.output_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, -1);
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_gpt2() {
|
struct ggml_cgraph * build_gpt2() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
@ -6258,6 +6441,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_phi2();
|
result = llm.build_phi2();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_PLAMO:
|
||||||
|
{
|
||||||
|
result = llm.build_plamo();
|
||||||
|
} break;
|
||||||
case LLM_ARCH_GPT2:
|
case LLM_ARCH_GPT2:
|
||||||
{
|
{
|
||||||
result = llm.build_gpt2();
|
result = llm.build_gpt2();
|
||||||
|
@ -10497,7 +10684,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
llama_unescape_whitespace(result);
|
llama_unescape_whitespace(result);
|
||||||
if (length < (int) result.length()) {
|
if (length < (int) result.length()) {
|
||||||
return -result.length();
|
return -(int) result.length();
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
|
@ -10527,7 +10714,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
|
||||||
std::string result = model->vocab.id_to_token[token].text;
|
std::string result = model->vocab.id_to_token[token].text;
|
||||||
result = llama_decode_text(result);
|
result = llama_decode_text(result);
|
||||||
if (length < (int) result.length()) {
|
if (length < (int) result.length()) {
|
||||||
return -result.length();
|
return -(int) result.length();
|
||||||
}
|
}
|
||||||
memcpy(buf, result.c_str(), result.length());
|
memcpy(buf, result.c_str(), result.length());
|
||||||
return result.length();
|
return result.length();
|
||||||
|
|
|
@ -883,9 +883,6 @@ int main(int argc, const char ** argv) {
|
||||||
srand(seed);
|
srand(seed);
|
||||||
const int nargs = 1;
|
const int nargs = 1;
|
||||||
|
|
||||||
int64_t ne2[4];
|
|
||||||
ne2[0] = 1;
|
|
||||||
|
|
||||||
for (int ndims = 1; ndims <= 2; ++ndims) {
|
for (int ndims = 1; ndims <= 2; ++ndims) {
|
||||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue