Merge branch 'ggerganov:master' into server-ui-improvements

Commit 97f4ec4631: 30 changed files with 1035 additions and 327 deletions

@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -34,6 +36,8 @@ else
     echo " ex: --outtype f16 \"/models/7B/\" "
     echo " --quantize (-q): Optimize with quantization process ggml"
     echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo " See documentation for finetune for command-line parameters"
     echo " --all-in-one (-a): Execute --convert & --quantize"
     echo " ex: \"/models/\" 7B"
     echo " --server (-s): Run a model on the server"

.gitignore (vendored): 25 changes

@@ -88,15 +88,16 @@ poetry.lock
 poetry.toml
 
 # Test binaries
-tests/test-grammar-parser
-tests/test-llama-grammar
-tests/test-double-float
-tests/test-grad0
-tests/test-opt
-tests/test-quantize-fns
-tests/test-quantize-perf
-tests/test-sampling
-tests/test-tokenizer-0-llama
-tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1-llama
-tests/test-tokenizer-1-bpe
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0-llama
+/tests/test-tokenizer-0-falcon
+/tests/test-tokenizer-1-llama
+/tests/test-tokenizer-1-bpe
+/tests/test-rope

@@ -116,6 +116,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 
+# enable libstdc++ assertions for debug builds
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+endif()
+
 if (NOT MSVC)
     if (LLAMA_SANITIZE_THREAD)
         add_compile_options(-fsanitize=thread)
@@ -165,7 +170,7 @@ if (LLAMA_METAL)
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
 
     # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
         ${FOUNDATION_LIBRARY}

Makefile: 29 changes

@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
     tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
     tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+    tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
     CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
     CC_IS_CLANG=1
-    ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+    ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
         CC_IS_LLVM_CLANG=1
     else
         CC_IS_APPLE_CLANG=1
@@ -174,6 +174,10 @@ ifdef LLAMA_DEBUG
     MK_CFLAGS += -O0 -g
     MK_CXXFLAGS += -O0 -g
     MK_LDFLAGS += -g
+
+    ifeq ($(UNAME_S),Linux)
+        MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+    endif
 else
     MK_CPPFLAGS += -DNDEBUG
 endif
@@ -648,7 +652,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
@@ -701,28 +705,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -737,5 +741,8 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-c.o: tests/test-c.c llama.h
     $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@

@@ -122,6 +122,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 
 ---
 
@@ -328,7 +329,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 
 ### BLAS Build
 
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
 
 - #### Accelerate Framework:
 
@@ -900,7 +901,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
 
 #### Usage

@@ -11,8 +11,13 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
     if(NOT IS_DIRECTORY "${GIT_DIR}")
         file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
         string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
+        if (SLASH_POS EQUAL 0)
+            set(GIT_DIR "${REAL_GIT_DIR}")
+        else()
             set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        endif()
     endif()
 
     set(GIT_INDEX "${GIT_DIR}/index")
 else()

@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
 
 import numpy as np
 import torch
@@ -168,6 +168,8 @@ class Model:
             return PersimmonModel
         if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return StableLMModel
+        if model_architecture == "QWenLMHeadModel":
+            return QwenModel
         return Model
 
     def _is_model_safetensors(self) -> bool:
@@ -203,6 +205,8 @@ class Model:
             return gguf.MODEL_ARCH.PERSIMMON
         if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return gguf.MODEL_ARCH.STABLELM
+        if arch == "QWenLMHeadModel":
+            return gguf.MODEL_ARCH.QWEN
 
         raise NotImplementedError(f'Architecture "{arch}" not supported!')
 
@@ -832,6 +836,131 @@ class StableLMModel(Model):
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)
 
 
+class QwenModel(Model):
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[self.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
+
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.special_tokens
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("Qwen")
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+
+    def write_tensors(self):
+        block_count = self.hparams["num_hidden_layers"]
+        model_kv = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######
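
For context on the added QwenModel.bpe helper: it rebuilds the two halves that were merged to form each vocabulary token by re-running byte-pair merging with a rank cap, which is how the GGUF merges list is recovered from a tiktoken-style rank table. Below is a minimal, self-contained Python sketch of that idea; the rank table and the token are made up for illustration and are not taken from the Qwen vocabulary.

from typing import Optional

def bpe(mergeable_ranks: dict, token: bytes, max_rank: Optional[int] = None) -> list:
    # Start from single bytes and repeatedly merge the lowest-ranked adjacent pair,
    # stopping before any merge whose rank is >= max_rank.
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

# Illustrative rank table: "ab" merges before "abc".
ranks = {b"ab": 0, b"abc": 1}
# Capping at the rank of "abc" yields the pair that this merge would combine.
print(bpe(ranks, b"abc", max_rank=1))  # [b'ab', b'c']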

@@ -267,7 +267,7 @@ class Params:
             n_ctx = 2048
 
         return Params(
-            n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_vocab = model["tok_embeddings.weight"].shape[0],
             n_embd = config["dim"],
             n_layer = config["n_layers"],
             n_ctx = n_ctx,
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.
 
 $ `make`
-$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

@@ -230,18 +230,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
     var result = [CChar](repeating: 0, count: 8)
     let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
     if nTokens < 0 {
-        if result.count >= -Int(nTokens) {
-            result.removeLast(-Int(nTokens))
-        } else {
-            result.removeAll()
-        }
+        let actualTokensCount = -Int(nTokens)
+        result = .init(repeating: 0, count: actualTokensCount)
         let check = llama_token_to_piece(
             model,
             token,
             &result,
             Int32(result.count)
         )
-        assert(check == nTokens)
+        assert(check == actualTokensCount)
     } else {
         result.removeLast(result.count - Int(nTokens))
     }
@@ -259,5 +256,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
         buffer = []
         return bufferString
     }
-    return nil
 }

@@ -164,13 +164,21 @@ actor LlamaContext {
     private func token_to_piece(token: llama_token) -> String {
         let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
         result.initialize(repeating: Int8(0), count: 8)
-        let _ = llama_token_to_piece(model, token, result, 8)
-
-        let resultStr = String(cString: result)
-
+        defer {
             result.deallocate()
+        }
+        let nTokens = llama_token_to_piece(model, token, result, 8)
 
-        return resultStr
+        if nTokens < 0 {
+            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
+            newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
+            defer {
+                newResult.deallocate()
+            }
+            _ = llama_token_to_piece(model, token, newResult, -nTokens)
+            return String(cString: newResult)
+        } else {
+            return String(cString: result)
+        }
     }
 }
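
Both Swift changes above rely on the same convention: when the destination buffer is too small, llama_token_to_piece returns the negative of the required length, and the caller retries with a buffer of exactly that size. A minimal Python sketch of that two-pass call pattern follows; the writer function is a stand-in for the real C API, not part of llama.cpp.

def token_to_piece(write_piece, initial_capacity: int = 8) -> str:
    # First attempt with a small buffer; a negative return value means
    # "buffer too small, the required size is -n", so retry once.
    buf = bytearray(initial_capacity)
    n = write_piece(buf)
    if n < 0:
        buf = bytearray(-n)
        n = write_piece(buf)
    return buf[:n].decode("utf-8", errors="ignore")

# Stand-in for llama_token_to_piece(model, token, buf, len(buf)).
def fake_write_piece(buf: bytearray) -> int:
    piece = "example-piece".encode("utf-8")
    if len(buf) < len(piece):
        return -len(piece)
    buf[:len(piece)] = piece
    return len(piece)

print(token_to_piece(fake_write_piece))  # example-piece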

@@ -5,7 +5,7 @@ import json
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
 
 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
+
+# with proper
 args = ap.parse_args()
 
 
@@ -96,13 +104,20 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
     vocab = json.load(f)
     tokens = [key for key in vocab]
 
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     config = json.load(f)
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
     v_hparams = config["vision_config"]
     t_hparams = config["text_config"]
 
@@ -117,9 +132,12 @@ ftype = 1
 if args.use_f32:
     ftype = 0
 
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
 has_text_encoder = True
@@ -128,13 +146,13 @@ has_llava_projector = False
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 elif args.llava_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
     fname_middle = ""
 
@@ -182,8 +200,12 @@ if has_vision_encoder:
     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
 
-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
     fout.add_array("clip.vision.image_mean", image_mean)
     fout.add_array("clip.vision.image_std", image_std)
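
As a reading aid, the normalization-constant selection introduced above can be summarized as a precedence rule: an explicit --image_mean/--image_std override wins, then the processor's values (when a CLIPProcessor is available), then the hard-coded defaults added in the diff. A small Python sketch of that rule, not the conversion script itself:

default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]

def pick(cli_value, processor_value, default):
    # Mirrors the branch above: with a processor, its value is used unless the
    # user supplied something other than the default; without one, fall back
    # from the CLI value to the default.
    if processor_value is not None:
        return processor_value if cli_value is None or cli_value == default else cli_value
    return cli_value if cli_value is not None else default

# Pure vision checkpoint (no processor) and no CLI override: defaults are used.
print(pick(None, None, default_image_mean))  # [0.48145466, 0.4578275, 0.40821073]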

examples/lookahead/README.md (new file): 7 additions

@@ -0,0 +1,7 @@
+# llama.cpp/examples/lookahead
+
+Demonstration of lookahead decoding technique:
+
+https://lmsys.org/blog/2023-11-21-lookahead-decoding/
+
+More info: https://github.com/ggerganov/llama.cpp/pull/4207
@@ -100,6 +100,12 @@ static void sigint_handler(int signo) {
 }
 #endif
 
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -113,6 +119,7 @@ int main(int argc, char ** argv) {
     log_set_target(log_filename_generator("main", "log"));
     LOG_TEE("Log start\n");
     log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
 
     // TODO: Dump params ?

@@ -11,10 +11,10 @@ app = Flask(__name__)
 slot_id = -1
 
 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
-parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
+parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
-parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
+parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
-parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
+parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
-parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
+parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
 parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
 parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
 parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
@@ -34,19 +34,19 @@ def is_present(json, key):
 
 #convert chat to prompt
 def convert_chat(messages):
-    prompt = "" + args.chat_prompt.replace("\\n", "\n")
-
-    system_n = args.system_name.replace("\\n", "\n")
-    user_n = args.user_name.replace("\\n", "\n")
-    ai_n = args.ai_name.replace("\\n", "\n")
-    stop = args.stop.replace("\\n", "\n")
+    system_n = args.system_name
+    user_n = args.user_name
+    ai_n = args.ai_name
+    stop = args.stop
+
+    prompt = "" + args.chat_prompt + stop
 
     for line in messages:
         if (line["role"] == "system"):
-            prompt += f"{system_n}{line['content']}"
+            prompt += f"{system_n}{line['content']}{stop}"
         if (line["role"] == "user"):
-            prompt += f"{user_n}{line['content']}"
+            prompt += f"{user_n}{line['content']}{stop}"
        if (line["role"] == "assistant"):
             prompt += f"{ai_n}{line['content']}{stop}"
     prompt += ai_n.rstrip()
@@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
             }
         ]
     }
-    slot_id = data["slot_id"]
+    slot_id = data.get("slot_id")
     if (chat):
         if (start):
             resData["choices"][0]["delta"] = {
@@ -150,11 +150,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
     return resData
 
 
-@app.route('/chat/completions', methods=['POST'])
-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
 def chat_completions():
     if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
         return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
     body = request.get_json()
     stream = False
     tokenize = False
@@ -177,20 +179,22 @@ def chat_completions():
         data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
         time_now = int(time.time())
         resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
-        yield 'data: {}\n'.format(json.dumps(resData))
+        yield 'data: {}\n\n'.format(json.dumps(resData))
         for line in data.iter_lines():
             if line:
                 decoded_line = line.decode('utf-8')
                 resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
-                yield 'data: {}\n'.format(json.dumps(resData))
-    return Response(generate(), mimetype='text/event-stream')
+                yield 'data: {}\n\n'.format(json.dumps(resData))
+    return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
 
 
-@app.route('/completions', methods=['POST'])
-@app.route('/v1/completions', methods=['POST'])
+@app.route('/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
 def completion():
     if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
         return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
     body = request.get_json()
     stream = False
     tokenize = False
@@ -216,8 +220,8 @@ def completion():
             if line:
                 decoded_line = line.decode('utf-8')
                 resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
-                yield 'data: {}\n'.format(json.dumps(resData))
-    return Response(generate(), mimetype='text/event-stream')
+                yield 'data: {}\n\n'.format(json.dumps(resData))
+    return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
 
 if __name__ == '__main__':
     app.run(args.host, port=args.port)
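
Two protocol details sit behind these api_like_OAI.py edits: the OPTIONS handlers plus the Access-Control-Allow-* headers satisfy browser CORS preflight requests, and each server-sent event has to be terminated by a blank line, which is why the stream now yields 'data: ...\n\n' rather than a single newline. A minimal Python illustration of the SSE framing (not the example script itself):

import json

def sse_event(payload: dict) -> str:
    # An SSE stream is a sequence of "data: ..." frames, each ended by a blank
    # line, so the frame terminator must be "\n\n" rather than "\n".
    return 'data: {}\n\n'.format(json.dumps(payload))

print(repr(sse_event({"content": "Hello"})))  # 'data: {"content": "Hello"}\n\n'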
@ -155,15 +155,23 @@ struct task_server {
|
||||||
json data;
|
json data;
|
||||||
bool infill_mode = false;
|
bool infill_mode = false;
|
||||||
bool embedding_mode = false;
|
bool embedding_mode = false;
|
||||||
|
int multitask_id = -1;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct task_result {
|
struct task_result {
|
||||||
int id;
|
int id;
|
||||||
|
int multitask_id = -1;
|
||||||
bool stop;
|
bool stop;
|
||||||
bool error;
|
bool error;
|
||||||
json result_json;
|
json result_json;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct task_multi {
|
||||||
|
int id;
|
||||||
|
std::set<int> subtasks_remaining{};
|
||||||
|
std::vector<task_result> results{};
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: can become bool if we can't find use of more states
|
// TODO: can become bool if we can't find use of more states
|
||||||
enum slot_state
|
enum slot_state
|
||||||
{
|
{
|
||||||
|
@ -406,6 +414,9 @@ struct llama_client_slot
|
||||||
double t_prompt_processing; // ms
|
double t_prompt_processing; // ms
|
||||||
double t_token_generation; // ms
|
double t_token_generation; // ms
|
||||||
|
|
||||||
|
// multitasks
|
||||||
|
int multitask_id = -1;
|
||||||
|
|
||||||
void reset() {
|
void reset() {
|
||||||
num_prompt_tokens = 0;
|
num_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
|
@ -529,7 +540,8 @@ struct llama_server_context
|
||||||
|
|
||||||
std::vector<task_server> queue_tasks;
|
std::vector<task_server> queue_tasks;
|
||||||
std::vector<task_result> queue_results;
|
std::vector<task_result> queue_results;
|
||||||
std::mutex mutex_tasks;
|
std::vector<task_multi> queue_multitasks;
|
||||||
|
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||||
std::mutex mutex_results;
|
std::mutex mutex_results;
|
||||||
|
|
||||||
~llama_server_context()
|
~llama_server_context()
|
||||||
|
@ -1112,17 +1124,40 @@ struct llama_server_context
|
||||||
return slot.images.size() > 0;
|
return slot.images.size() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_error(int id, std::string error)
|
void send_error(task_server& task, std::string error)
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = id;
|
res.id = task.id;
|
||||||
|
res.multitask_id = task.multitask_id;
|
||||||
res.stop = false;
|
res.stop = false;
|
||||||
res.error = true;
|
res.error = true;
|
||||||
res.result_json = { { "content", error } };
|
res.result_json = { { "content", error } };
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
task_multi multi;
|
||||||
|
multi.id = id;
|
||||||
|
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||||
|
queue_multitasks.push_back(multi);
|
||||||
|
}
|
||||||
|
|
||||||
|
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
for (auto& multitask : queue_multitasks)
|
||||||
|
{
|
||||||
|
if (multitask.id == multitask_id)
|
||||||
|
{
|
||||||
|
multitask.subtasks_remaining.erase(subtask_id);
|
||||||
|
multitask.results.push_back(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
json get_model_props()
|
json get_model_props()
|
||||||
{
|
{
|
||||||
return get_formated_generation(slots[0]);
|
return get_formated_generation(slots[0]);
|
||||||
|
@ -1167,6 +1202,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = false;
|
res.stop = false;
|
||||||
|
|
||||||
|
@ -1206,6 +1242,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = true;
|
res.stop = true;
|
||||||
|
|
||||||
|
@ -1251,6 +1288,12 @@ struct llama_server_context
|
||||||
res.result_json["model"] = slot.oaicompat_model;
|
res.result_json["model"] = slot.oaicompat_model;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parent multitask, if any, needs to be updated
|
||||||
|
if (slot.multitask_id != -1)
|
||||||
|
{
|
||||||
|
update_multi_task(slot.multitask_id, slot.task_id, res);
|
||||||
|
}
|
||||||
|
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1259,6 +1302,7 @@ struct llama_server_context
|
||||||
std::lock_guard<std::mutex> lock(mutex_results);
|
std::lock_guard<std::mutex> lock(mutex_results);
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
|
res.multitask_id = slot.multitask_id;
|
||||||
res.error = false;
|
res.error = false;
|
||||||
res.stop = true;
|
res.stop = true;
|
||||||
|
|
||||||
|
@ -1285,9 +1329,9 @@ struct llama_server_context
|
||||||
queue_results.push_back(res);
|
queue_results.push_back(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
int request_completion(json data, bool infill, bool embedding)
|
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||||
task_server task;
|
task_server task;
|
||||||
task.id = id_gen++;
|
task.id = id_gen++;
|
||||||
task.target_id = 0;
|
task.target_id = 0;
|
||||||
|
@ -1295,6 +1339,16 @@ struct llama_server_context
|
||||||
task.infill_mode = infill;
|
task.infill_mode = infill;
|
||||||
task.embedding_mode = embedding;
|
task.embedding_mode = embedding;
|
||||||
task.type = COMPLETION_TASK;
|
task.type = COMPLETION_TASK;
|
||||||
|
task.multitask_id = multitask_id;
|
||||||
|
|
||||||
|
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||||
|
if (task.data.at("prompt").size() > 1)
|
||||||
|
{
|
||||||
|
lock.unlock(); // entering new func scope
|
||||||
|
return split_multiprompt_task(task);
|
||||||
|
}
|
||||||
|
|
||||||
|
// otherwise, it's a single-prompt task, we actually queue it
|
||||||
queue_tasks.push_back(task);
|
queue_tasks.push_back(task);
|
||||||
return task.id;
|
return task.id;
|
||||||
}
|
}
|
||||||
|
@ -1313,8 +1367,17 @@ struct llama_server_context
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||||
{
|
{
|
||||||
|
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||||
|
if (queue_results[i].multitask_id == task_id)
|
||||||
|
{
|
||||||
|
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
|
||||||
|
queue_results.erase(queue_results.begin() + i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (queue_results[i].id == task_id)
|
if (queue_results[i].id == task_id)
|
||||||
{
|
{
|
||||||
|
assert(queue_results[i].multitask_id == -1);
|
||||||
task_result res = queue_results[i];
|
task_result res = queue_results[i];
|
||||||
queue_results.erase(queue_results.begin() + i);
|
queue_results.erase(queue_results.begin() + i);
|
||||||
return res;
|
return res;
|
||||||
|
@ -1404,6 +1467,27 @@ struct llama_server_context
|
||||||
queue_tasks.push_back(task);
|
queue_tasks.push_back(task);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int split_multiprompt_task(task_server& multiprompt_task)
|
||||||
|
{
|
||||||
|
int prompt_count = multiprompt_task.data.at("prompt").size();
|
||||||
|
assert(prompt_count > 1);
|
||||||
|
|
||||||
|
int multitask_id = id_gen++;
|
||||||
|
std::vector<int> subtask_ids(prompt_count);
|
||||||
|
for (int i = 0; i < prompt_count; i++)
|
||||||
|
{
|
||||||
|
json subtask_data = multiprompt_task.data;
|
||||||
|
subtask_data["prompt"] = subtask_data["prompt"][i];
|
||||||
|
|
||||||
|
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||||
|
subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
// queue up the multitask so we can track its subtask progression
|
||||||
|
add_multi_task(multitask_id, subtask_ids);
|
||||||
|
return multitask_id;
|
||||||
|
}
|
||||||
|
|
||||||
void process_tasks()
|
void process_tasks()
|
||||||
{
|
{
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||||
|
@ -1419,7 +1503,7 @@ struct llama_server_context
|
||||||
{
|
{
|
||||||
LOG_TEE("slot unavailable\n");
|
LOG_TEE("slot unavailable\n");
|
||||||
// send error result
|
// send error result
|
||||||
send_error(task.id, "slot unavailable");
|
send_error(task, "slot unavailable");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1433,11 +1517,12 @@ struct llama_server_context
|
||||||
slot->infill = task.infill_mode;
|
slot->infill = task.infill_mode;
|
||||||
     slot->embedding    = task.embedding_mode;
     slot->task_id      = task.id;
+    slot->multitask_id = task.multitask_id;

     if (!launch_slot_with_data(slot, task.data))
     {
         // send error result
-        send_error(task.id, "internal_error");
+        send_error(task, "internal_error");
         break;
     }
 } break;
@@ -1453,6 +1538,38 @@ struct llama_server_context
                 } break;
             }
         }
+
+        // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
+        auto queue_iterator = queue_multitasks.begin();
+        while (queue_iterator != queue_multitasks.end())
+        {
+            if (queue_iterator->subtasks_remaining.empty())
+            {
+                // all subtasks done == multitask is done
+                task_result aggregate_result;
+                aggregate_result.id = queue_iterator->id;
+                aggregate_result.stop = true;
+                aggregate_result.error = false;
+
+                // collect json results into one json result
+                std::vector<json> result_jsons;
+                for (auto& subres : queue_iterator->results)
+                {
+                    result_jsons.push_back(subres.result_json);
+                    aggregate_result.error = aggregate_result.error && subres.error;
+                }
+                aggregate_result.result_json = json{ "results", result_jsons };
+
+                std::lock_guard<std::mutex> lock(mutex_results);
+                queue_results.push_back(aggregate_result);
+
+                queue_iterator = queue_multitasks.erase(queue_iterator);
+            }
+            else
+            {
+                ++queue_iterator;
+            }
+        }
     }

     bool update_slots() {
@@ -1844,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -spf FNAME, --system-prompt-file FNAME\n");
     printf("                        Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
+    printf("  --log-disable         disables logging to a file.\n");
     printf("\n");
 }

@@ -2198,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.mmproj = argv[i];
     }
+    else if (arg == "--log-disable")
+    {
+        log_set_target(stdout);
+        LOG_INFO("logging to file is disabled.", {});
+    }
     else
     {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -2596,7 +2719,7 @@ int main(int argc, char **argv)
     svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2685,7 +2808,7 @@ int main(int argc, char **argv)
             {
                 json data = oaicompat_completion_params_parse(json::parse(req.body));

-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);

                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
@@ -2754,7 +2877,7 @@ int main(int argc, char **argv)
     svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
             {
                 json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false);
+                const int task_id = llama.request_completion(data, true, false, -1);
                 if (!json_value(data, "stream", false)) {
                     std::string completion_text;
                     task_result result = llama.next_result(task_id);
@@ -2858,7 +2981,7 @@ int main(int argc, char **argv)
                 {
                     prompt = "";
                 }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
                 task_result result = llama.next_result(task_id);
                 return res.set_content(result.result_json.dump(), "application/json");
             });
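All of the server endpoints above gain a fourth argument to request_completion. A hedged reading of the new call shape (the parameter name is inferred from the slot/multitask code in this same section, it is not spelled out in the hunk): the last argument identifies a parent multitask, with -1 meaning the request is standalone. A minimal C++ sketch of a standalone request under that assumption:

    // sketch only; argument comments are an interpretation, not part of the diff
    json data = json::parse(req.body);
    const int task_id = llama.request_completion(
        data,
        /* infill       */ false,
        /* embedding    */ false,
        /* multitask_id */ -1);   // assumed: -1 = not a subtask of a multi-prompt task
    task_result result = llama.next_result(task_id);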
examples/speculative/README.md | 8 (new file)
@@ -0,0 +1,8 @@
+# llama.cpp/examples/speculative
+
+Demonstration of speculative decoding and tree-based speculative decoding techniques
+
+More info:
+
+- https://github.com/ggerganov/llama.cpp/pull/2926
+- https://github.com/ggerganov/llama.cpp/pull/3624
@@ -137,7 +137,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {

 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
-    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    size_t cur_max = (char*)addr - (char*)alloc->base + size;
     if (cur_max > alloc->max_size) {
         printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
ggml-cuda.cu | 130
@@ -443,6 +443,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
@@ -501,6 +502,31 @@ static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -577,15 +603,6 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }

-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
 template <int block_size>
 static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -624,14 +641,6 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }

-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
 template <int block_size>
 static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
@@ -4717,45 +4726,74 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
     dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
 }

-// the CUDA soft max implementation differs from the CPU implementation
-// instead of doubles floats are used
-static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int block_size = blockDim.y;
-    const int tid = threadIdx.y;
+static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale) {
+    const int tid  = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = blockDim.x;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    __shared__ float buf[CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE];

     float max_val = -INFINITY;

     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        max_val = max(max_val, x[i]);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f));
     }

     // find the max value in the block
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32));
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val);
     }

     float tmp = 0.f;

     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
-        const float val = expf(x[i] - max_val);
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val = expf((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val);
         tmp += val;
-        dst[i] = val;
+        dst[ix] = val;
     }

-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp);
     }

     const float inv_tmp = 1.f / tmp;

     for (int col = tid; col < ncols; col += block_size) {
-        const int i = row*ncols + col;
+        const int i = rowx*ncols + col;
         dst[i] *= inv_tmp;
     }
 }
@@ -5792,10 +5830,12 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
     diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }

-static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) {
-    const dim3 block_dims(1, WARP_SIZE, 1);
+static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth,     1, 1);
     const dim3 block_nums(nrows_x, 1, 1);
-    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
+    soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
 }

 static void im2col_f32_f16_cuda(const float * x, half * dst,
@@ -6846,14 +6886,18 @@ inline void ggml_cuda_op_soft_max(
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
     const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;

-    soft_max_f32_cuda(src0_dd, dst_dd, ne00, nrows, main_stream);
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);

-    (void) src1;
     (void) dst;
-    (void) src1_dd;
 }

 inline void ggml_cuda_op_scale(
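For reference, the per-row computation the new fused kernel performs is soft_max(x*scale + mask): with m = max_j(scale*x_j + mask_j), dst_i = exp(scale*x_i + mask_i - m) / sum_j exp(scale*x_j + mask_j - m). The following scalar C++ sketch (illustrative only, the helper name is not part of the diff) shows the same math that the CUDA kernel above computes with warp and block reductions; the mask pointer may be null, matching the optional src1.

    #include <algorithm>
    #include <cmath>

    // reference: fused soft_max(x*scale + mask) applied to one row of length ncols
    static void soft_max_row_ref(const float * x, const float * mask, float * dst, int ncols, float scale) {
        float max_val = -INFINITY;
        for (int i = 0; i < ncols; ++i) {
            max_val = std::max(max_val, x[i]*scale + (mask ? mask[i] : 0.0f));
        }
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            const float v = std::exp(x[i]*scale + (mask ? mask[i] : 0.0f) - max_val);
            dst[i] = v;   // store exp once, normalize in a second pass
            sum += v;
        }
        const float inv_sum = 1.0f/sum;
        for (int i = 0; i < ncols; ++i) {
            dst[i] *= inv_sum;
        }
    }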
ggml-metal.m | 31
@@ -1028,20 +1028,27 @@ void ggml_metal_graph_compute(
             int nth = 32; // SIMD width

             if (ne00%4 == 0) {
+                while (nth < ne00/4 && nth < 256) {
+                    nth *= 2;
+                }
                 [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
             } else {
-                do {
+                while (nth < ne00 && nth < 1024) {
                     nth *= 2;
-                } while (nth <= ne00 && nth <= 1024);
-                nth /= 2;
+                }
                 [encoder setComputePipelineState:ctx->pipeline_soft_max];
             }

+            const float scale = ((float *) dst->op_params)[0];
+
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-            [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+            [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

             [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
         } break;
@@ -1351,7 +1358,11 @@ void ggml_metal_graph_compute(
             float eps;
             memcpy(&eps, dst->op_params, sizeof(float));

-            const int nth = MIN(512, ne00);
+            int nth = 32; // SIMD width
+
+            while (nth < ne00/4 && nth < 1024) {
+                nth *= 2;
+            }

             [encoder setComputePipelineState:ctx->pipeline_rms_norm];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1359,7 +1370,7 @@ void ggml_metal_graph_compute(
             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
             [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-            [encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
+            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

             const int64_t nrows = ggml_nrows(src0);

ggml-metal.metal | 148
@@ -39,6 +39,8 @@ typedef struct {
     int8_t qs[QK8_0]; // quants
 } block_q8_0;

+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
 // general-purpose kernel for addition of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
 // cons: not very efficient
@@ -180,10 +182,12 @@ kernel void kernel_gelu(

 kernel void kernel_soft_max(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
@@ -195,72 +199,76 @@ kernel void kernel_soft_max(
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

     device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+    device const float * pmask = src1 ? src1 + i01*ne00 : nullptr;
     device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

     // parallel max
-    float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
-    for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]);
-    }
-
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
+    float lmax = -INFINITY;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+    }
+
+    // find the max value in the block
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }

     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp(psrc0[i00] - max);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum += exp_psrc0;
-        // Remember the result of exp here. exp is expensive, so we really do not
-        // wish to compute it twice.
         pdst[i00] = exp_psrc0;
     }

     float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;

     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        pdst[i00] /= sum;
+        pdst[i00] *= inv_sum;
     }
 }

 kernel void kernel_soft_max_4(
         device const float * src0,
+        device const float * src1,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
+        constant     float & scale,
         threadgroup float  * buf [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
@@ -272,63 +280,67 @@ kernel void kernel_soft_max_4(
     const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);

     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const float4 * pmask = src1 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
     device       float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

     // parallel max
-    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
-    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]);
+    float4 lmax4 = -INFINITY;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
     }

     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-        }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
+
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }

     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp(psrc4[i00] - max);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }

     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
     float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            buf[tpitg] += buf[tpitg + i];
-        }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;

     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        pdst4[i00] /= sum;
+        pdst4[i00] *= inv_sum;
     }
 }

@@ -435,14 +447,13 @@ kernel void kernel_rms_norm(
         constant   int64_t & ne00,
         constant  uint64_t & nb01,
         constant     float & eps,
-        threadgroup float  * sum [[threadgroup(0)]],
+        threadgroup float  * buf [[threadgroup(0)]],
         uint tgpig[[threadgroup_position_in_grid]],
         uint tpitg[[thread_position_in_threadgroup]],
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
     device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float * x_scalar = (device const float *) x;

     float4 sumf = 0;
     float all_sum = 0;
@@ -453,40 +464,30 @@ kernel void kernel_rms_norm(
     }
     all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
     all_sum = simd_sum(all_sum);
-    if (tiisg == 0) {
-        sum[sgitg] = all_sum;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-        if (tpitg < i) {
-            sum[tpitg] += sum[tpitg + i];
-        }
-    }
-    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
-            sum[0] += x_scalar[i];
-        }
-        sum[0] /= ne00;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    const float mean  = sum[0];
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = all_sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        all_sum = buf[tiisg];
+        all_sum = simd_sum(all_sum);
+    }
+
+    const float mean  = all_sum/ne00;
     const float scale = 1.0f/sqrt(mean + eps);

     device float4 * y = (device float4 *) (dst + tgpig*ne00);
-    device float * y_scalar = (device float *) y;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         y[i00] = x[i00] * scale;
     }
-    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
-            y_scalar[i00] = x_scalar[i00] * scale;
-        }
-    }
 }

 // function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
@@ -576,7 +577,6 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
-#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
 //      giard against the number of rows not being divisible by
@@ -1,20 +1,18 @@
+#include "ggml.h"
 #include "ggml-opencl.h"

 #include <array>
 #include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
-#include <limits>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>

-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
ggml.c | 73
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
 static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale,
         bool                  inplace) {
+    GGML_ASSERT(ggml_is_contiguous(a));
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
+    }
+
     bool is_node = false;

     if (a->grad) {
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
     result->op   = GGML_OP_SOFT_MAX;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;

     return result;
 }
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }

 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+
+struct ggml_tensor * ggml_soft_max_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    return ggml_soft_max_impl(ctx, a, mask, scale, false);
 }

 // ggml_soft_max_back
@@ -10551,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
 static void ggml_compute_forward_soft_max_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous(dst));
+    assert(ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
     // TODO: handle transposed/permuted matrices

     const int ith = params->ith;
     const int nth = params->nth;

+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);

@@ -10575,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);

+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
+
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+
+        ggml_vec_cpy_f32  (nc, wp, sp);
+        ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            ggml_vec_acc_f32(nc, wp, mp);
+        }

 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif

         float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, sp);
+        ggml_vec_max_f32(nc, &max, wp);

         ggml_float sum = 0.0;

         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
                 sum += (ggml_float)val;
@@ -10622,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
 static void ggml_compute_forward_soft_max(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -13863,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         } break;
     case GGML_OP_SOFT_MAX:
         {
-            ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+            ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
         } break;
     case GGML_OP_SOFT_MAX_BACK:
         {
@@ -15899,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                 }
             } break;
+        case GGML_OP_SOFT_MAX:
+            {
+                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+
+                cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 GGML_ASSERT(node->src[0]->ne[3] == 1);
ggml.h | 13
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
             fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
             ggml_print_backtrace(); \
-            exit(1); \
+            abort(); \
         } \
     } while (0)

@@ -1283,6 +1282,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);

+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_soft_max_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
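Usage sketch (added for clarity, not part of the diff): with this declaration, a caller that previously composed the scaled, masked soft-max from separate ops can emit a single fused node. The tensor names kq and kq_mask and the 1/sqrt(n_embd_head) scale below are illustrative assumptions about a typical attention graph rather than something this hunk defines.

    // before: three graph nodes (scale, add mask, soft_max)
    // kq = ggml_soft_max(ctx, ggml_add(ctx, ggml_scale(ctx, kq, kq_scale), kq_mask));

    // after: one fused node; the mask argument may be NULL when no masking is needed
    struct ggml_tensor * cur = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));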
@@ -92,6 +92,7 @@ class MODEL_ARCH(IntEnum):
     BERT     = auto()
     BLOOM    = auto()
     STABLELM = auto()
+    QWEN     = auto()


 class MODEL_TENSOR(IntEnum):
@@ -132,6 +133,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BERT:     "bert",
     MODEL_ARCH.BLOOM:    "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
+    MODEL_ARCH.QWEN:     "qwen",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -317,6 +319,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -336,6 +352,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.PERSIMMON: [
         MODEL_TENSOR.ROPE_FREQS,
     ],
+    MODEL_ARCH.QWEN: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

 #
@@ -10,7 +10,7 @@ class TensorNameMap:
     # Token embeddings
     MODEL_TENSOR.TOKEN_EMBD: (
         "gpt_neox.embed_in",            # gptneox
-        "transformer.wte",              # gpt2 gpt-j mpt refact
+        "transformer.wte",              # gpt2 gpt-j mpt refact qwen
         "transformer.word_embeddings",  # falcon
         "word_embeddings",              # bloom
         "model.embed_tokens",           # llama-hf
@@ -38,7 +38,7 @@ class TensorNameMap:
     # Output
     MODEL_TENSOR.OUTPUT: (
         "embed_out",                 # gptneox
-        "lm_head",                   # gpt2 mpt falcon llama-hf baichuan
+        "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
         "output",                    # llama-pth bloom
         "word_embeddings_for_head",  # persimmon
     ),
@@ -51,7 +51,7 @@ class TensorNameMap:
         "norm",                                    # llama-pth
         "embeddings.LayerNorm",                    # bert
         "transformer.norm_f",                      # mpt
-        "ln_f",                                    # refact bloom
+        "ln_f",                                    # refact bloom qwen
         "language_model.encoder.final_layernorm",  # persimmon
     ),

@@ -65,7 +65,7 @@ class TensorNameMap:
     # Attention norm
     MODEL_TENSOR.ATTN_NORM: (
         "gpt_neox.layers.{bid}.input_layernorm",  # gptneox
-        "transformer.h.{bid}.ln_1",               # gpt2 gpt-j refact
+        "transformer.h.{bid}.ln_1",               # gpt2 gpt-j refact qwen
         "transformer.blocks.{bid}.norm_1",        # mpt
         "transformer.h.{bid}.input_layernorm",    # falcon7b
         "h.{bid}.input_layernorm",                # bloom
@@ -85,7 +85,7 @@ class TensorNameMap:
     # Attention query-key-value
     MODEL_TENSOR.ATTN_QKV: (
         "gpt_neox.layers.{bid}.attention.query_key_value",     # gptneox
-        "transformer.h.{bid}.attn.c_attn",                      # gpt2
+        "transformer.h.{bid}.attn.c_attn",                      # gpt2 qwen
         "transformer.blocks.{bid}.attn.Wqkv",                   # mpt
         "transformer.h.{bid}.self_attention.query_key_value",   # falcon
         "h.{bid}.self_attention.query_key_value",               # bloom
@@ -119,7 +119,7 @@ class TensorNameMap:
     # Attention output
     MODEL_TENSOR.ATTN_OUT: (
         "gpt_neox.layers.{bid}.attention.dense",     # gptneox
-        "transformer.h.{bid}.attn.c_proj",           # gpt2 refact
+        "transformer.h.{bid}.attn.c_proj",           # gpt2 refact qwen
         "transformer.blocks.{bid}.attn.out_proj",    # mpt
         "transformer.h.{bid}.self_attention.dense",  # falcon
         "h.{bid}.self_attention.dense",              # bloom
@@ -139,7 +139,7 @@ class TensorNameMap:
     # Feed-forward norm
     MODEL_TENSOR.FFN_NORM: (
         "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
-        "transformer.h.{bid}.ln_2",                         # gpt2 refact
+        "transformer.h.{bid}.ln_2",                         # gpt2 refact qwen
         "h.{bid}.post_attention_layernorm",                 # bloom
         "transformer.blocks.{bid}.norm_2",                  # mpt
         "model.layers.{bid}.post_attention_layernorm",      # llama-hf
@@ -161,18 +161,20 @@ class TensorNameMap:
         "encoder.layer.{bid}.intermediate.dense",                 # bert
         "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
         "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
+        "transformer.h.{bid}.mlp.w1",                             # qwen
     ),

     # Feed-forward gate
     MODEL_TENSOR.FFN_GATE: (
         "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
         "layers.{bid}.feed_forward.w1",      # llama-pth
+        "transformer.h.{bid}.mlp.w2",        # qwen
     ),

     # Feed-forward down
     MODEL_TENSOR.FFN_DOWN: (
         "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
-        "transformer.h.{bid}.mlp.c_proj",           # gpt2 refact
+        "transformer.h.{bid}.mlp.c_proj",           # gpt2 refact qwen
         "transformer.blocks.{bid}.ffn.down_proj",   # mpt
         "transformer.h.{bid}.mlp.dense_4h_to_h",    # falcon
         "h.{bid}.mlp.dense_4h_to_h",                # bloom
278
llama.cpp
278
llama.cpp
|
@ -46,7 +46,6 @@
|
||||||
#endif
|
#endif
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <io.h>
|
#include <io.h>
|
||||||
#include <stdio.h> // for _fseeki64
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -193,6 +192,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_REFACT,
|
LLM_ARCH_REFACT,
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
|
LLM_ARCH_QWEN,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -209,6 +209,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_REFACT, "refact" },
|
{ LLM_ARCH_REFACT, "refact" },
|
||||||
{ LLM_ARCH_BLOOM, "bloom" },
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
|
{ LLM_ARCH_QWEN, "qwen" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
|
@ -519,6 +520,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_QWEN,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
|
@ -1249,6 +1266,9 @@ struct llama_layer {
|
||||||
struct ggml_tensor * wqkv;
|
struct ggml_tensor * wqkv;
|
||||||
|
|
||||||
// attention bias
|
// attention bias
|
||||||
|
struct ggml_tensor * bq;
|
||||||
|
struct ggml_tensor * bk;
|
||||||
|
struct ggml_tensor * bv;
|
||||||
struct ggml_tensor * bo;
|
struct ggml_tensor * bo;
|
||||||
struct ggml_tensor * bqkv;
|
struct ggml_tensor * bqkv;
|
||||||
|
|
||||||
|
@ -1971,10 +1991,13 @@ struct llama_model_loader {
|
||||||
return tensor;
|
return tensor;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
|
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
||||||
|
|
||||||
if (cur == NULL) {
|
if (cur == NULL) {
|
||||||
|
if (!required) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2348,6 +2371,15 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_QWEN:
|
||||||
|
{
|
||||||
|
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 32: model.type = e_model::MODEL_7B; break;
|
||||||
|
case 40: model.type = e_model::MODEL_13B; break;
|
||||||
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
|
||||||
default: (void)0;
|
default: (void)0;
|
||||||
}
|
}
|
||||||
|
@ -2783,6 +2815,12 @@ static void llm_load_tensors(
|
||||||
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
||||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
|
||||||
|
// optional bias tensors
|
||||||
|
layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
|
||||||
|
layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
|
||||||
|
layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
|
||||||
|
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
|
||||||
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
@ -2792,8 +2830,13 @@ static void llm_load_tensors(
|
||||||
if (backend == GGML_BACKEND_GPU) {
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
vram_weights +=
|
vram_weights +=
|
||||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
|
||||||
ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
(layer.bq ? ggml_nbytes(layer.bq) : 0) +
|
||||||
|
(layer.bk ? ggml_nbytes(layer.bk) : 0) +
|
||||||
|
(layer.bv ? ggml_nbytes(layer.bv) : 0) +
|
||||||
|
(layer.bo ? ggml_nbytes(layer.bo) : 0) +
|
||||||
|
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
|
||||||
|
ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
@@ -3311,6 +3354,71 @@ static void llm_load_tensors(
                        }
                    }
                } break;
+            case LLM_ARCH_QWEN:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                            // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                            backend_norm = llama_backend_offload;
+#else
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+#endif // _WIN32
+
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                        model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                        if (backend_norm == GGML_BACKEND_GPU) {
+                            vram_weights += ggml_nbytes(model.output_norm);
+                        }
+                        if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                            vram_weights += ggml_nbytes(model.output);
+                        }
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff / 2;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
+                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
+
+                        if (backend == GGML_BACKEND_GPU) {
+                            vram_weights +=
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                                ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                        }
+                    }
+                } break;

            default:
                throw std::runtime_error("unknown architecture");
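In the per-layer loop above, `i_gpu_start = n_layer - n_gpu_layers` decides the CPU/GPU split: layers with an index below `i_gpu_start` keep the CPU backend, the rest use the offload backend. A standalone sketch of just that split, assuming toy values for `n_layer` and `n_gpu_layers`:

#include <cstdio>

int main() {
    const int n_layer      = 32; // e.g. a 7B model
    const int n_gpu_layers = 20; // user-requested offload count

    // layers [0, i_gpu_start) stay on the CPU backend, [i_gpu_start, n_layer) are offloaded;
    // if n_gpu_layers exceeds n_layer, i_gpu_start goes negative and every layer is offloaded
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int i = 0; i < n_layer; ++i) {
        std::printf("layer %2d -> %s\n", i, i < i_gpu_start ? "CPU" : "offload");
    }
    return 0;
}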
@@ -3705,6 +3813,8 @@ static struct ggml_tensor * llm_build_kqv(
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    cb(kq, "kq", il);

+    if (max_alibi_bias > 0.0f) {
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
        kq = ggml_scale(ctx, kq, kq_scale);
        cb(kq, "kq_scaled", il);
@@ -3721,6 +3831,10 @@ static struct ggml_tensor * llm_build_kqv(

        kq = ggml_soft_max(ctx, kq);
        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }

    // split cached v into n_head heads
    struct ggml_tensor * v =
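The new `ggml_soft_max_ext` path folds the scale (`1.0f/sqrtf(float(n_embd_head))`) and the additive `KQ_mask` into the softmax itself, replacing the separate scale, mask and soft_max steps that remain only in the alibi branch. The sketch below is just an illustration of the arithmetic on one row of scores, not the ggml kernel:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// softmax(scores * scale + mask) over one row; mask entries of -INFINITY
// suppress future positions, like the causal KQ_mask in the graph.
static std::vector<float> soft_max_ext_row(const std::vector<float> & scores,
                                           const std::vector<float> & mask,
                                           float scale) {
    std::vector<float> out(scores.size());
    float max_val = -INFINITY;
    for (size_t i = 0; i < scores.size(); ++i) {
        out[i]  = scores[i] * scale + mask[i];
        max_val = std::max(max_val, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) { v = std::exp(v - max_val); sum += v; }
    for (float & v : out) { v /= sum; }
    return out;
}

int main() {
    const int   n_embd_head = 64;
    const float scale       = 1.0f / std::sqrt((float) n_embd_head);

    std::vector<float> kq_row  = { 8.0f, 4.0f, 2.0f, 1.0f };
    std::vector<float> kq_mask = { 0.0f, 0.0f, -INFINITY, -INFINITY }; // mask out the last two positions

    for (float p : soft_max_ext_row(kq_row, kq_mask, scale)) std::printf("%.4f ", p);
    std::printf("\n");
    return 0;
}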
@@ -3886,12 +4000,24 @@ struct llm_build_context {
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }

                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }

                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }

                Qcur = ggml_rope_custom(
                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4036,7 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

                cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                cb(cur, "kqv_out", il);
            }
@@ -4903,6 +5029,121 @@ struct llm_build_context {

        return gf;
    }

+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
    };

    //
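One detail of `build_qwen` worth spelling out: the checkpoint ships a fused QKV projection (`wqkv` plus `bqkv`), and the three `ggml_view_2d` calls slice Q, K and V back out of each token's row at byte offsets of 0, 1 and 2 times `sizeof(float) * n_embd`. A self-contained sketch of that row-offset split on a plain buffer, with toy sizes in place of the real hyperparameters:

#include <cstdio>
#include <vector>

int main() {
    const int n_embd   = 4; // toy embedding width
    const int n_tokens = 2;

    // fused QKV output: each token row holds [Q | K | V], i.e. 3 * n_embd floats
    std::vector<float> qkv(n_tokens * 3 * n_embd);
    for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

    // the analogue of the three ggml_view_2d offsets: 0, 1 and 2 times n_embd into each row
    auto view = [&](int which, int token) {
        return std::vector<float>(
            qkv.begin() + token * 3 * n_embd + which * n_embd,
            qkv.begin() + token * 3 * n_embd + which * n_embd + n_embd);
    };

    const char * names[3] = { "Q", "K", "V" };
    for (int t = 0; t < n_tokens; ++t) {
        for (int w = 0; w < 3; ++w) {
            std::printf("token %d %s:", t, names[w]);
            for (float v : view(w, t)) std::printf(" %4.0f", v);
            std::printf("\n");
        }
    }
    return 0;
}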
@@ -5042,6 +5283,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
    { "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
    { "kq_masked",       OFFLOAD_FUNC_KQ },
    { "kq_soft_max",     OFFLOAD_FUNC_V },
+    { "kq_soft_max_ext", OFFLOAD_FUNC_V },
    { "v",               OFFLOAD_FUNC_V },
    { "kqv",             OFFLOAD_FUNC_V },
    { "kqv_merged",      OFFLOAD_FUNC_V },
@ -5376,6 +5618,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_stablelm();
|
result = llm.build_stablelm();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_QWEN:
|
||||||
|
{
|
||||||
|
result = llm.build_qwen();
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
@@ -7027,6 +7273,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
    // Replace the data in candidates with the new_candidates data
    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
    candidates->size = new_candidates.size();
+    candidates->sorted = false;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -7648,18 +7895,21 @@ static void llama_convert_tensor_internal(
        return;
    }

-    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
-    auto block_size_bytes = ggml_type_size(tensor->type);
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
-    auto nblocks = nelements / block_size;
-    auto blocks_per_thread = nblocks / nthread;
-    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

-    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
-        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
-        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
-        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;
+
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
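The rewritten loop splits `nblocks` evenly over `nthread` workers and hands any remainder (`spare_blocks`) to the last thread; the explicit `size_t` types presumably also keep the buffer offsets unsigned instead of the `int`s the old comma-initialised `for` deduced. A quick standalone check of that partitioning arithmetic:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t nblocks = 1000003; // deliberately not divisible by the thread count
    const int    nthread = 8;

    const size_t blocks_per_thread = nblocks / nthread;
    const size_t spare_blocks      = nblocks - blocks_per_thread * nthread; // leftover goes to the last thread

    size_t assigned = 0;
    for (int tnum = 0; tnum < nthread; tnum++) {
        const size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        std::printf("thread %d: %zu blocks\n", tnum, thr_blocks);
        assigned += thr_blocks;
    }
    std::printf("total assigned: %zu (expected %zu)\n", assigned, nblocks);
    return 0;
}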
1 prompts/chat-with-qwen.txt Normal file
@@ -0,0 +1 @@
+You are a helpful assistant.

3 requirements-hf-to-gguf.txt Normal file
@@ -0,0 +1,3 @@
+-r requirements.txt
+torch==2.1.1
+transformers==4.35.2