diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..113ca5fd3 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 125 diff --git a/.gitignore b/.gitignore index b3ff6526c..e7bfd52e3 100644 --- a/.gitignore +++ b/.gitignore @@ -34,8 +34,10 @@ models/* /perplexity /embedding /train-text-from-scratch +/simple /benchmark-matmult /vdot +/server /Pipfile /libllama.so diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..65796fe2e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +exclude: prompts/.*.txt +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.2.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files +- repo: https://github.com/PyCQA/flake8 + rev: 6.0.0 + hooks: + - id: flake8 diff --git a/CMakeLists.txt b/CMakeLists.txt index ea9f80b80..f5a968533 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -158,23 +159,59 @@ if (LLAMA_BLAS) if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22) set(BLA_SIZEOF_INTEGER 8) endif() + set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) + if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. - # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - ) + if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
+ # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS REQUIRED blas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") + pkg_check_modules(DepBLAS REQUIRED openblas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") + pkg_check_modules(DepBLAS REQUIRED blis) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS REQUIRED blas-atlas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) @@ -201,6 +238,7 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) @@ -423,8 +461,10 @@ target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) +add_library(ggml_static STATIC $) if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) + add_library(ggml_shared SHARED $) endif() add_library(llama diff --git a/Makefile b/Makefile index 09c8834f5..72d6ad40c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,10 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server + LLAMA_SERVER_VERBOSE ?= 1 +server: private CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif default: $(BUILD_TARGETS) @@ -142,11 +144,7 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas - ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),) - LDFLAGS += -lopenblas -lcblas - else - LDFLAGS += -lopenblas - endif + LDFLAGS += -lopenblas endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS @@ -171,6 +169,11 @@ ifdef LLAMA_CUDA_DMMV_Y else NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1 endif # LLAMA_CUDA_DMMV_Y +ifdef LLAMA_CUDA_KQUANTS_ITER + NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +else + NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS @@ -271,6 +274,12 @@ main: examples/main/main.cpp build-info.h ggml. @echo '==== Run ./main -h for help. ====' @echo +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + @echo + @echo '==== Run ./simple -h for help. ====' + @echo + quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index cc3bd5394..7defb7584 100644 --- a/README.md +++ b/README.md @@ -336,7 +336,6 @@ Building the program with BLAS support may lead to some performance improvements cmake .. -DLLAMA_CUBLAS=ON cmake --build . --config Release ``` - Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1. The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. @@ -616,6 +615,7 @@ And after 4.45 hours, you will have the final perplexity. ### Android +#### Building the Project using Android NDK You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/). First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: ``` @@ -630,6 +630,46 @@ Finally, copy the `llama` binary and the model files to your device storage. Her https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 +#### Building the Project using Termux (F-Droid) +Termux from F-Droid offers an alternative route to execute the project on an Android device. 
This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card. + +Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU. + +If you opt to utilize OpenBLAS, you'll need to install the corresponding package. +``` +apt install libopenblas +``` + +Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages: +``` +apt install ocl-icd opencl-headers opencl-clhpp clinfo +``` + +In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below: +``` +cmake . +make +cp libclblast.so* $PREFIX/lib +cp ./include/clblast.h ../llama.cpp +``` + +Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below: +``` +cp /data/data/com.termux/files/usr/include/openblas/cblas.h . +cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h . +make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice) +``` + +Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below: +``` +GGML_OPENCL_PLATFORM=0 +GGML_OPENCL_DEVICE=0 +export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH +./main (...) +``` + +For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. + ### Docker #### Prerequisites diff --git a/convert.py b/convert.py index ece5a0266..265c41fa0 100644 --- a/convert.py +++ b/convert.py @@ -512,7 +512,11 @@ class LazyTensor: if not isinstance(self.data_type, QuantizedDataType): raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})") if self.data_type.have_g_idx: - sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML. For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n") + sys.stderr.write( + "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), " + "which is not yet natively supported by GGML. 
" + "For now you can still convert this model by passing `--outtype f16` to dequantize, " + "but that will result in a much larger output file for no quality benefit.\n") sys.exit(1) assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends @@ -694,8 +698,9 @@ class LazyUnpickler(pickle.Unpickler): description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' return LazyStorage(load=load, kind=pid[1], description=description) - # @staticmethod - def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] + # @staticmethod + def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, + # pyright: ignore[reportSelfClsParameterName] requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: assert isinstance(storage, LazyStorage) @@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus: # Use mmap for the actual data to avoid race conditions with the file offset. off = fp.raw.tell() mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) - fp.raw.seek(off) # needed on Windows + fp.raw.seek(off) # needed on Windows def read_tensor() -> None: # this is a function so that variables captured in `load` don't change shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12)) @@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus: files = list(path.glob("model-00001-of-*.safetensors")) if not files: # Try the PyTorch patterns too, with lower priority - globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ] + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: # Try GGML too, but with lower priority, since if both a non-GGML @@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab: elif path3.exists(): path = path3 else: - raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir") + raise FileNotFoundError( + f"Could not find tokenizer.model in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") added_tokens_path = path.parent / "added_tokens.json" print(f"Loading vocab file {path}") return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) @@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path: }[params.file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" if ret in model_paths: - sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input. Please explicitly specify a path using --outfile.\n") + sys.stderr.write( + f"Error: Default output path ({ret}) would overwrite the input. 
" + "Please explicitly specify a path using --outfile.\n") sys.exit(1) return ret @@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("model", type=Path, + help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") args = parser.parse_args(args_in) vocab: Vocab diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index de005f3e3..cf9c4a223 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) + add_subdirectory(simple) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 0add6adc0..50e14c4ac 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -4,6 +4,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + float frand() { return (float)rand()/(float)RAND_MAX; } @@ -1470,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te } struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - const float eps = 1e-3; + const float eps = 1e-3f; return ggml_sum(ctx, ggml_neg(ctx, diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 9f9ed9db0..39d15caeb 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -16,6 +16,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) { } void tensor_dump(const ggml_tensor * tensor, const char * name) { - printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name, + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, tensor->type, ggml_type_name(tensor->type), - (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float sum = tensor_sum_elements(tensor); printf("Sum of tensor %s is %6.2f\n", name, sum); } @@ -120,7 +124,7 @@ int main(int argc, char ** argv) { ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += 1024*1024*16; - printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, diff --git a/examples/common.cpp b/examples/common.cpp index b47f06273..fed24e027 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -28,6 +28,10 @@ #include 
#endif +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -102,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } if (arg == "-s" || arg == "--seed") { -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); -#endif if (++i >= argc) { invalid_param = true; break; @@ -373,7 +374,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { throw std::exception(); } - } catch (const std::exception &e) { + } catch (const std::exception&) { invalid_param = true; break; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 03603b10f..860f99f67 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -4,6 +4,10 @@ #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + int main(int argc, char ** argv) { gpt_params params; diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py index d00b28652..1b6c54bff 100644 --- a/examples/jeopardy/graph.py +++ b/examples/jeopardy/graph.py @@ -1,5 +1,5 @@ import matplotlib.pyplot as plt -import sys, os +import os import csv labels = [] @@ -8,6 +8,7 @@ numEntries = 1 rows = [] + def bar_chart(numbers, labels, pos): plt.bar(pos, numbers, color='blue') plt.xticks(ticks=pos, labels=labels) @@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos): plt.ylabel("Questions Correct") plt.show() + def calculatecorrect(): directory = os.fsencode("./examples/jeopardy/results/") csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',') @@ -38,14 +40,13 @@ def calculatecorrect(): print(line) else: print("Correct answer: " + rows[i][2] + "\n") - i+=1 + i += 1 print("Did the AI get the question right? (y/n)") if input() == "y": totalcorrect += 1 numbers.append(totalcorrect) - if __name__ == '__main__': calculatecorrect() pos = list(range(numEntries)) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index efa913e16..941312f9c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -23,11 +23,17 @@ #include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX #define NOMINMAX +#endif #include #include #endif +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + static console_state con_st; static llama_context ** g_ctx; @@ -348,7 +354,7 @@ int main(int argc, char ** argv) { if ((int)embd.size() > max_embd_size) { auto skipped_tokens = embd.size() - max_embd_size; console_set_color(con_st, CONSOLE_COLOR_ERROR); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console_set_color(con_st, CONSOLE_COLOR_DEFAULT); fflush(stdout); embd.resize(max_embd_size); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e19c6825f..ae8cfe0af 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -5,6 +5,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6e4f7e1e0..6b8018ee2 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -19,6 +19,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; bool verbose = false; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 91f04b6c7..da4d37ad0 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { // init auto ctx = llama_init_from_file(params.model.c_str(), lparams); auto tokens = std::vector(params.n_ctx); - auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true); + auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index bd65c84b1..07ba76ad3 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,10 @@ set(TARGET server) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$ +) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) diff --git a/examples/server/README.md b/examples/server/README.md index 3b111655a..474a28b20 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,37 +1,74 @@ # llama.cpp/example/server -This example allow you to have a llama.cpp http server to interact from a web page or consume the API. +This example demonstrates a simple HTTP API server to interact with llama.cpp. -## Table of Contents +Command line options: -1. [Quick Start](#quick-start) -2. [Node JS Test](#node-js-test) -3. [API Endpoints](#api-endpoints) -4. [More examples](#more-examples) -5. [Common Options](#common-options) -6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) +- `--threads N`, `-t N`: Set the number of threads to use during computation. +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. 
+- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. +- `--port`: Set the port to listen. Default: `8080`. + +## Build + +Build llama.cpp with server from repository root with either make or CMake. + +- Using `make`: + + ```bash + LLAMA_BUILD_SERVER=1 make + ``` + +- Using `CMake`: + + ```bash + mkdir build-server + cd build-server + cmake -DLLAMA_BUILD_SERVER=ON .. + cmake --build . --config Release + ``` ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: -#### Unix-based systems (Linux, macOS, etc.): -Make sure to build with the server option on -```bash -LLAMA_BUILD_SERVER=1 make -``` +### Unix-based systems (Linux, macOS, etc.): ```bash -./server -m models/7B/ggml-model.bin --ctx_size 2048 +./server -m models/7B/ggml-model.bin -c 2048 ``` -#### Windows: +### Windows: ```powershell -server.exe -m models\7B\ggml-model.bin --ctx_size 2048 +server.exe -m models\7B\ggml-model.bin -c 2048 ``` -That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. +The above command will start a server that by default listens on `127.0.0.1:8080`. +You can consume the endpoints with Postman or NodeJS with axios library. + +## Testing with CURL + +Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. 
+ +```sh +curl --request POST \ + --url http://localhost:8080/completion \ + --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' +``` ## Node JS Test @@ -54,7 +91,6 @@ const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { let result = await axios.post("http://127.0.0.1:8080/completion", { prompt, - batch_size: 128, n_predict: 512, }); @@ -73,247 +109,75 @@ node . ## API Endpoints -You can interact with this API Endpoints. This implementations just support chat style interaction. +- **POST** `/completion`: Given a prompt, it returns the predicted completion. -- **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks. + *Options:* -*Options:* + `temperature`: Adjust the randomness of the generated text (default: 0.8). -`batch_size`: Set the batch size for prompt processing (default: 512). + `top_k`: Limit the next token selection to the K most probable tokens (default: 40). -`temperature`: Adjust the randomness of the generated text (default: 0.8). + `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). -`top_k`: Limit the next token selection to the K most probable tokens (default: 40). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). -`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. + By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. -`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity). + `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. -`threads`: Set the number of threads to use during computation. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. -`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + `stop`: Specify a JSON array of stopping strings. + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). -`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. + `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). -`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`. + `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). -`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). 
-`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). -`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. + `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). -- **POST** `hostname:port/embedding`: Generate embedding of a given text + `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled). -*Options:* + `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled); -`content`: Set the text to get generate the embedding. + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). -`threads`: Set the number of threads to use during computation. + `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0). -To use this endpoint, you need to start the server with the `--embedding` option added. + `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). -- **POST** `hostname:port/tokenize`: Tokenize a given text + `seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). -*Options:* + `ignore_eos`: Ignore end of stream token and continue generating (default: false). -`content`: Set the text to tokenize. + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []). -- **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request. +- **POST** `/tokenize`: Tokenize a given text. -*Options:* + *Options:* -`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation. + `content`: Set the text to tokenize. ## More examples ### Interactive mode -This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model. +Check the sample in [chat.mjs](chat.mjs). +Run with NodeJS version 16 or later: -The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well. - -This example works well for `Vicuna - version 1`. - -```javascript -const axios = require("axios"); - -let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. -### Human: Hello, Assistant. -### Assistant: Hello. How may I help you today? -### Human: Please tell me the largest city in Europe. -### Assistant: Sure. 
The largest city in Europe is Moscow, the capital of Russia.`; - -async function ChatCompletion(answer) { - // the user's next question to the prompt - prompt += `\n### Human: ${answer}\n` - - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["\n### Human:"], // when detect this, stop completion - exclude: ["### Assistant:"], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion to the prompt. - prompt += `### Assistant: ${message}`; - break; - } - } -} - -// This function should be called every time a question to the model is needed. -async function Test() { - // the server can't inference in paralell - await ChatCompletion("Write a long story about a time magician in a fantasy world"); - await ChatCompletion("Summary the story"); -} - -Test(); +```sh +node chat.mjs ``` -### Alpaca example +Another sample in [chat.sh](chat.sh). +Requires [bash](https://www.gnu.org/software/bash/), [curl](https://curl.se) and [jq](https://jqlang.github.io/jq/). +Run with bash: -**Temporaly note:** no tested, if you have the model, please test it and report me some issue - -```javascript -const axios = require("axios"); - -let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request. -`; - -async function DoInstruction(instruction) { - prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`; - result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - batch_size: 128, - temperature: 0.2, - top_k: 40, - top_p: 0.9, - n_keep: -1, - n_predict: 2048, - stop: ["### Instruction:\n\n"], // when detect this, stop completion - exclude: [], // no show in the completion - threads: 8, - as_loop: true, // use this to request the completion token by token - interactive: true, // enable the detection of a stop word - }); - - // create a loop to receive every token predicted - // note: this operation is blocking, avoid use this in a ui thread - - let message = ""; - while (true) { - result = await axios.get("http://127.0.0.1:8080/next-token"); - process.stdout.write(result.data.content); - message += result.data.content; - - // to avoid an infinite loop - if (result.data.stop) { - console.log("Completed"); - // make sure to add the completion and the user's next question to the prompt. - prompt += message; - break; - } - } -} - -// This function should be called every time a instruction to the model is needed. 
-DoInstruction("Destroy the world"); // as joke +```sh +bash chat.sh ``` - -### Embeddings - -First, run the server with `--embedding` option: - -```bash -server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding -``` - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/embedding", { - content: `Hello`, - threads: 5 - }); - // print the embedding array - console.log(result.data.embedding); -} - -Test(); -``` - -### Tokenize - -Run this code in NodeJS: - -```javascript -const axios = require('axios'); - -async function Test() { - let result = await axios.post("http://127.0.0.1:8080/tokenize", { - content: `Hello` - }); - // print the embedding array - console.log(result.data.tokens); -} - -Test(); -``` - -## Common Options - -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). -- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. -- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; -- `--port`: Set the port to listen. Default: `8080`. - -### RNG Seed - -- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed). - -The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. - -## Performance Tuning and Memory Options - -### No Memory Mapping - -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. 
However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. - -### Memory Float 32 - -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended. - -## Limitations: - -- The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware. diff --git a/examples/server/chat.mjs b/examples/server/chat.mjs new file mode 100644 index 000000000..8269e2592 --- /dev/null +++ b/examples/server/chat.mjs @@ -0,0 +1,89 @@ +import * as readline from 'node:readline' +import { stdin, stdout } from 'node:process' + +const API_URL = 'http://127.0.0.1:8080' + +const chat = [ + { + human: "Hello, Assistant.", + assistant: "Hello. How may I help you today?" + }, + { + human: "Please tell me the largest city in Europe.", + assistant: "Sure. The largest city in Europe is Moscow, the capital of Russia." + }, +] + +const instruction = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.` + +function format_prompt(question) { + return `${instruction}\n${ + chat.map(m =>`### Human: ${m.human}\n### Assistant: ${m.assistant}`).join("\n") + }\n### Human: ${question}\n### Assistant:` +} + +async function tokenize(content) { + const result = await fetch(`${API_URL}/tokenize`, { + method: 'POST', + body: JSON.stringify({ content }) + }) + + if (!result.ok) { + return [] + } + + return await result.json().tokens +} + +const n_keep = await tokenize(instruction).length + +async function chat_completion(question) { + const result = await fetch(`${API_URL}/completion`, { + method: 'POST', + body: JSON.stringify({ + prompt: format_prompt(question), + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: n_keep, + n_predict: 256, + stop: ["\n### Human:"], // stop completion after generating this + stream: true, + }) + }) + + if (!result.ok) { + return + } + + let answer = '' + + for await (var chunk of result.body) { + const t = Buffer.from(chunk).toString('utf8') + if (t.startsWith('data: ')) { + const message = JSON.parse(t.substring(6)) + answer += message.content + process.stdout.write(message.content) + if (message.stop) { + if (message.truncated) { + chat.shift() + } + break + } + } + } + + process.stdout.write('\n') + chat.push({ human: question, assistant: answer.trimStart() }) +} + +const rl = readline.createInterface({ input: stdin, output: stdout }); + +const readlineQuestion = (rl, query, options) => new Promise((resolve, reject) => { + rl.question(query, options, resolve) +}); + +while(true) { + const question = await readlineQuestion(rl, '> ') + await chat_completion(question) +} diff --git a/examples/server/chat.sh b/examples/server/chat.sh new file mode 100644 index 000000000..a89f8e908 --- /dev/null +++ b/examples/server/chat.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +API_URL="${API_URL:-http://127.0.0.1:8080}" + +CHAT=( + "Hello, Assistant." + "Hello. How may I help you today?" + "Please tell me the largest city in Europe." + "Sure. The largest city in Europe is Moscow, the capital of Russia." +) + +INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions." + +trim() { + shopt -s extglob + set -- "${1##+([[:space:]])}" + printf "%s" "${1%%+([[:space:]])}" +} + +trim_trailing() { + shopt -s extglob + printf "%s" "${1%%+([[:space:]])}" +} + +format_prompt() { + echo -n "${INSTRUCTION}" + printf "\n### Human: %s\n### Assistant: %s" "${CHAT[@]}" "$1" +} + +tokenize() { + curl \ + --silent \ + --request POST \ + --url "${API_URL}/tokenize" \ + --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ + | jq '.tokens[]' +} + +N_KEEP=$(tokenize "${INSTRUCTION}" | wc -l) + +chat_completion() { + PROMPT="$(trim_trailing "$(format_prompt "$1")")" + DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{ + prompt: ., + temperature: 0.2, + top_k: 40, + top_p: 0.9, + n_keep: $n_keep, + n_predict: 256, + stop: ["\n### Human:"], + stream: true + }')" + + ANSWER='' + + while IFS= read -r LINE; do + if [[ $LINE = data:* ]]; then + CONTENT="$(echo "${LINE:5}" | jq -r '.content')" + printf "%s" "${CONTENT}" + ANSWER+="${CONTENT}" + fi + done < <(curl \ + --silent \ + --no-buffer \ + --request POST \ + --url "${API_URL}/completion" \ + --data-raw "${DATA}") + + printf "\n" + + CHAT+=("$1" "$(trim "$ANSWER")") +} + +while true; do + read -r -e -p "> " QUESTION + chat_completion "${QUESTION}" +done diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 872750053..12d4e2fa4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,799 +1,928 @@ -#include -#include #include "common.h" #include "llama.h" +#include "build-info.h" -struct server_params -{ - std::string hostname = "127.0.0.1"; - int32_t port = 8080; -}; +// single thread +#define CPPHTTPLIB_THREAD_POOL_COUNT 1 +#ifndef NDEBUG +// crash the server in debug mode, otherwise send an http 500 error +#define CPPHTTPLIB_NO_EXCEPTIONS 1 +#endif -struct llama_server_context -{ - bool as_loop = false; - bool has_next_token = false; - std::string generated_text = ""; +#include "httplib.h" +#include "json.hpp" - int32_t num_tokens_predicted = 0; - int32_t n_past = 0; - int32_t n_consumed = 0; - int32_t n_session_consumed = 0; - int32_t n_remain = 0; - - std::vector embd; - std::vector last_n_tokens; - std::vector processed_tokens; - std::vector llama_token_newline; - std::vector embd_inp; - std::vector> no_show_words; - std::vector tokens_predicted; - - llama_context *ctx; - gpt_params params; - - void rewind() { - as_loop = false; - params.antiprompt.clear(); - no_show_words.clear(); - num_tokens_predicted = 0; - generated_text = ""; - } - - bool loadModel(gpt_params params_) - { - params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) - { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return false; - } - // determine newline token - llama_token_newline = ::llama_tokenize(ctx, "\n", false); - last_n_tokens.resize(params.n_ctx); - std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); - return true; - } - - bool loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); - // compare the evaluated prompt with the new prompt - int new_prompt_len = 0; - for (size_t i = 0; i < prompt_tokens.size(); i++) { - if (i < processed_tokens.size() && - processed_tokens[i] == prompt_tokens[i]) - { - continue; - } - else - { - embd_inp.push_back(prompt_tokens[i]); - if(new_prompt_len == 0) { - if(int32_t(i) - 1 < n_past) { - 
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end()); - } - // Evaluate the new fragment prompt from the last token processed. - n_past = processed_tokens.size(); - } - new_prompt_len ++; - } - } - if(n_past > 0 && params.interactive) { - n_remain -= new_prompt_len; - } - if ((int)embd_inp.size() > params.n_ctx - 4) - { - return false; - } - has_next_token = true; - return true; - } - - void beginCompletion() - { - if(n_remain == 0) { - // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) - { - params.n_keep = (int)embd_inp.size(); - } - } - n_remain = params.n_predict; - } - - llama_token nextToken() { - llama_token result = -1; - if (embd.size() > 0) - { - if (n_past + (int)embd.size() > params.n_ctx) - { - // Reset context - const int n_left = n_past - params.n_keep; - n_past = std::max(1, params.n_keep); - processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end()); - embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size()); - } - for (int i = 0; i < (int)embd.size(); i += params.n_batch) - { - int n_eval = (int)embd.size() - i; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - has_next_token = false; - return result; - } - n_past += n_eval; - } - } - embd.clear(); - if ((int)embd_inp.size() <= n_consumed && has_next_token) - { - // out of user input, sample next token - const float temp = params.temp; - // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - llama_token id = 0; - { - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) - { - logits[it->first] += it->second; - } - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) - { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl()]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { - logits[llama_token_nl()] = nl_logit; - } - - if (temp <= 0) - { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &candidates_p); - } - else - { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } - else if (mirostat == 2) - { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } - else - { - // Temperature sampling - llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); - llama_sample_typical(ctx, &candidates_p, typical_p, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); - id = llama_sample_token(ctx, &candidates_p); - } - } - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(id); - processed_tokens.push_back(id); - num_tokens_predicted++; - } - - // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive) - { - id = llama_token_newline.front(); - if (params.antiprompt.size() != 0) - { - // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); - embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); - } - } - - // add it to the context - embd.push_back(id); - for (auto id : embd) - { - result = id; - } - // decrement remaining sampling budget - --n_remain; - } - else - { - // some user input remains from prompt or interaction, forward it to processing - while ((int)embd_inp.size() > n_consumed) - { - embd.push_back(embd_inp[n_consumed]); - last_n_tokens.erase(last_n_tokens.begin()); - last_n_tokens.push_back(embd_inp[n_consumed]); - processed_tokens.push_back(embd_inp[n_consumed]); - ++n_consumed; - if 
((int)embd.size() >= params.n_batch) - { - break; - } - } - } - if (params.interactive && (int)embd_inp.size() <= n_consumed) - { - // check for reverse prompt - if (params.antiprompt.size()) - { - std::string last_output; - for (auto id : last_n_tokens) - { - last_output += llama_token_to_str(ctx, id); - } - has_next_token = true; - // Check if each of the reverse prompts appears at the end of the output. - for (std::string &antiprompt : params.antiprompt) - { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) - { - has_next_token = false; - return result; - } - } - } - if (n_past > 0) - { - has_next_token = true; - } - } - - if (!embd.empty() && embd.back() == llama_token_eos()) { - has_next_token = false; - } - - if (params.interactive && n_remain <= 0 && params.n_predict != -1) - { - n_remain = params.n_predict; - } - has_next_token = n_remain != 0; - return result; - } - - std::string doCompletion() - { - llama_token token = nextToken(); - if (token == -1) { - return ""; - } - tokens_predicted.clear(); - tokens_predicted.push_back(token); - - // Avoid add the no show words to the response - for (std::vector word_tokens : no_show_words) - { - size_t match_token = 1; - if (tokens_predicted.front() == word_tokens.front()) - { - bool execute_matching = true; - if (tokens_predicted.size() > 1) { // if previus tokens had been tested - for (size_t i = 1; i < word_tokens.size(); i++) - { - if (i >= tokens_predicted.size()) { - match_token = i; - break; - } - if (tokens_predicted[i] == word_tokens[i]) - { - continue; - } - else - { - execute_matching = false; - break; - } - } - } - while (execute_matching) { - if (match_token == word_tokens.size()) { - return ""; - } - token = nextToken(); - tokens_predicted.push_back(token); - if (token == word_tokens[match_token]) - { // the token follow the sequence - match_token++; - } - else if (match_token < word_tokens.size()) - { // no complete all word sequence - break; - } - } - } - } - if(as_loop) { - generated_text = ""; - } - for (llama_token tkn : tokens_predicted) - { - generated_text += llama_token_to_str(ctx, tkn); - } - return generated_text; - } - - std::vector embedding(std::string content, int threads) { - content.insert(0, 1, ' '); - std::vector tokens = ::llama_tokenize(ctx, content, true); - if (tokens.size() > 0) - { - if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads)) - { - fprintf(stderr, "%s : failed to eval\n", __func__); - std::vector embeddings_; - return embeddings_; - } - } - const int n_embd = llama_n_embd(ctx); - const auto embeddings = llama_get_embeddings(ctx); - std::vector embeddings_(embeddings, embeddings + n_embd); - return embeddings_; - } -}; +#ifndef SERVER_VERBOSE +#define SERVER_VERBOSE 1 +#endif using namespace httplib; - using json = nlohmann::json; -void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) -{ - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " 
--embedding enable embedding mode\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - if (llama_mlock_supported()) - { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_mmap_supported()) - { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); - fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); -#endif - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); - fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n"); - fprintf(stderr, " --port PORT port to listen (default 8080)\n"); - fprintf(stderr, "\n"); +struct server_params { + std::string hostname = "127.0.0.1"; + int32_t port = 8080; + int32_t read_timeout = 600; + int32_t write_timeout = 600; +}; + +static size_t common_part(const std::vector & a, const std::vector & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + return i; } -bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms) -{ - gpt_params default_params; - std::string arg; - bool invalid_param = false; +enum stop_type { + STOP_FULL, + STOP_PARTIAL, +}; - for (int i = 1; i < argc; i++) - { - arg = argv[i]; - if (arg == "--port") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } - else if (arg == "--host") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } - else if (arg == "-s" || arg == "--seed") - { -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); -#endif - if (++i >= argc) - { - invalid_param = true; - break; - } - params.seed = std::stoi(argv[i]); - } - else if (arg == "-m" || arg == "--model") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.model = argv[i]; - } - else if (arg == "-a" || arg == "--alias") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } - else if (arg == "--embedding") - { - params.embedding = true; - } - else if (arg == "-h" || arg == "--help") - { - server_print_usage(argc, argv, default_params); - exit(0); - } - else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } - else if (arg == "--memory-f32" || arg == "--memory_f32") - { - params.memory_f16 = false; - } - else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD 
- params.n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif - } - else if (arg == "--tensor-split" || arg == "-ts") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) - { - if (i < split_arg.size()) - { - params.tensor_split[i] = std::stof(split_arg[i]); - } - else - { - params.tensor_split[i] = 0.0f; - } - } -#else - fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--low-vram" || arg == "-lv") - { -#ifdef GGML_USE_CUBLAS - params.low_vram = true; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--main-gpu" || arg == "-mg") - { - if (++i >= argc) - { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif - } - else - { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); - exit(1); - } - } - - if (invalid_param) - { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argc, argv, default_params); - exit(1); - } - return true; +static bool ends_with(const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) { - if (!body["threads"].is_null()) - { - llama.params.n_threads = body["threads"].get(); - } - if (!body["n_predict"].is_null()) - { - llama.params.n_predict = body["n_predict"].get(); - } - if (!body["top_k"].is_null()) - { - llama.params.top_k = body["top_k"].get(); - } - if (!body["top_p"].is_null()) - { - llama.params.top_p = body["top_p"].get(); - } - if (!body["temperature"].is_null()) - { - llama.params.temp = body["temperature"].get(); - } - if (!body["batch_size"].is_null()) - { - llama.params.n_batch = body["batch_size"].get(); - } - if (!body["n_keep"].is_null()) - { - llama.params.n_keep = body["n_keep"].get(); - } - if (!body["as_loop"].is_null()) - { - llama.as_loop = body["as_loop"].get(); - } - if (!body["interactive"].is_null()) - { - llama.params.interactive = body["interactive"].get(); - } - if (!body["prompt"].is_null()) - { - llama.params.prompt = body["prompt"].get(); - } - else - { - json data = { - {"status", "error"}, - {"reason", "You need to pass the prompt"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return false; - } - if (!body["stop"].is_null()) - { - std::vector stop_words = body["stop"].get>(); - for (std::string stop_word : stop_words) - { - llama.params.antiprompt.push_back(stop_word); - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false)); - } - 
} - if (!body["exclude"].is_null()) - { - std::vector no_show_words = body["exclude"].get>(); - for (std::string no_show : no_show_words) - { - llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false)); - } - } - return true; -} - -int main(int argc, char **argv) -{ - // own arguments required by this example - gpt_params params; - server_params sparams; - - // struct that contains llama context and inference - llama_server_context llama; - params.model = "ggml-model.bin"; - - if (server_params_parse(argc, argv, sparams, params) == false) - { - return 1; - } - - if (params.seed <= 0) - { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); - - // load the model - if (!llama.loadModel(params)) - { - return 1; - } - - Server svr; - - svr.Get("/", [](const Request &, Response &res) - { res.set_content("

<h1>llama.cpp server works</h1>

", "text/html"); }); - - svr.Post("/completion", [&llama](const Request &req, Response &res) - { - if(llama.params.embedding) { - json data = { - {"status", "error"}, - {"reason", "To use completion function disable embedding mode"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; - } - - llama.rewind(); - - if(parse_options_completion(json::parse(req.body), llama, res) == false){ - return; - } - - if (!llama.loadPrompt()) - { - json data = { - {"status", "error"}, - {"reason", "Context too long, please be more specific"}}; - res.set_content(data.dump(), "application/json"); - res.status = 400; - return; - } - - llama.beginCompletion(); - if(llama.as_loop) { - json data = { - {"status", "done" } }; - return res.set_content(data.dump(), "application/json"); - } else { - // loop inference until finish completion - while (llama.has_next_token) - { - llama.doCompletion(); +static size_t find_partial_stop_string(const std::string & stop, + const std::string & text) { + if (!text.empty() && !stop.empty()) { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { + if (stop[char_index] == text_last_char) { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) { + return text.size() - char_index - 1; } - try - { - json data = { - {"model", llama.params.model_alias }, - {"content", llama.generated_text }, - {"tokens_predicted", llama.num_tokens_predicted}}; - return res.set_content(data.dump(), "application/json"); - } - catch (const json::exception &e) - { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "Bad encoding token"}, - {"tokens_predicted", 0}}; - return res.set_content(data.dump(), "application/json"); - } - } }); - - svr.Post("/tokenize", [&llama](const Request &req, Response &res) - { - json body = json::parse(req.body); - json data = { - {"tokens", ::llama_tokenize(llama.ctx, body["content"].get(), false) } }; - return res.set_content(data.dump(), "application/json"); - }); - - svr.Post("/embedding", [&llama](const Request &req, Response &res) - { - if(!llama.params.embedding) { - std::vector empty; - json data = { - {"embedding", empty}}; - fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n"); - return res.set_content(data.dump(), "application/json"); - } - json body = json::parse(req.body); - std::string content = body["content"].get(); - int threads = body["threads"].get(); - json data = { - {"embedding", llama.embedding(content, threads) } }; - return res.set_content(data.dump(), "application/json"); - }); - - svr.Get("/next-token", [&llama](const Request &req, Response &res) - { - if(llama.params.embedding) { - res.set_content("{}", "application/json"); - return; } - std::string result = ""; - if (req.has_param("stop")) { - llama.has_next_token = false; + } + } + return std::string::npos; +} + +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += llama_token_to_str(ctx, *begin); + } + return ret; +} + +static void server_log(const char * level, const char * function, int line, + const char * message, const nlohmann::ordered_json & extra) { + nlohmann::ordered_json log { + { "timestamp", time(nullptr) }, + { "level", level }, + { "function", function }, + { "line", line }, + { "message", message }, + }; + + if (!extra.empty()) { + 
log.merge_patch(extra); + } + + const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); + fprintf(stdout, "%.*s\n", (int)str.size(), str.data()); + fflush(stdout); +} + +static bool server_verbose = false; + +#if SERVER_VERBOSE != 1 +# define LOG_VERBOSE(MSG, ...) +#else +# define LOG_VERBOSE(MSG, ...) \ + do { \ + if (server_verbose) { \ + server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + } \ + } while(0) +#endif + +#define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) + +struct llama_server_context { + bool stream = false; + bool has_next_token = false; + std::string generated_text; + + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; + + std::vector embd; + std::vector last_n_tokens; + + llama_context * ctx = nullptr; + gpt_params params; + + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + std::string stopping_word; + int32_t multibyte_pending = 0; + + ~llama_server_context() { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + } + + void rewind() { + params.antiprompt.clear(); + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + multibyte_pending = 0; + + n_remain = 0; + n_past = 0; + } + + bool loadModel(const gpt_params & params_) { + params = params_; + ctx = llama_init_from_gpt_params(params); + if (ctx == nullptr) { + LOG_ERROR("unable to load model", { { "model", params_.model } }); + return false; + } + + last_n_tokens.resize(params.n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + return true; + } + + void loadPrompt() { + params.prompt.insert(0, 1, ' '); // always add a first space + std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + + if (params.n_keep < 0) { + params.n_keep = (int)prompt_tokens.size(); + } + params.n_keep = std::min(params.n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (prompt_tokens.size() >= (size_t)params.n_ctx) { + const int n_left = (params.n_ctx - params.n_keep) / 2; + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left; + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end()); + std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin()); + + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); + + truncated = true; + prompt_tokens = new_tokens; + } else { + const size_t ps = prompt_tokens.size(); + std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0); + std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + embd = prompt_tokens; + if (n_past == prompt_tokens.size()) { + // we have to evaluate at least 1 token to generate logits. 
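A quick worked example of the truncation arithmetic in `loadPrompt` above, with illustrative sizes: the server keeps the first `n_keep` tokens, drops whole `n_left`-sized blocks from the middle of the prompt, and keeps the most recent remainder so the result fits in the context window.

```cpp
// Walk-through of the loadPrompt truncation formula (sizes are illustrative).
#include <cstdio>

int main() {
    const int n_ctx    = 512; // context size
    const int n_keep   = 64;  // tokens preserved from the start of the prompt
    const int n_prompt = 900; // incoming prompt is too long for the context

    const int n_left        = (n_ctx - n_keep) / 2;                      // 224
    const int erased_blocks = (n_prompt - n_keep - n_left - 1) / n_left; // 2

    // kept = first n_keep tokens + everything after the erased middle blocks
    const int kept = n_keep + (n_prompt - n_keep - erased_blocks * n_left); // 452
    printf("n_left=%d erased_blocks=%d kept=%d (fits in n_ctx=%d)\n",
           n_left, erased_blocks, kept, n_ctx);
    return 0;
}
```

In the opposite case, when the cached tokens already cover the whole prompt, at least one token still has to be re-evaluated so that fresh logits are available for sampling; the `n_past--` that follows handles exactly that.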
+ n_past--; + } + + LOG_VERBOSE("prompt ingested", { + { "n_past", n_past }, + { "cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past) }, + { "to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); + + has_next_token = true; + } + + void beginCompletion() { + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_set_rng_seed(ctx, params.seed); + } + + llama_token nextToken() { + llama_token result = -1; + + if (embd.size() >= (size_t)params.n_ctx) { + // Reset context + const int n_left = (params.n_ctx - params.n_keep) / 2; + + std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); + new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); + embd = new_tokens; + n_past = params.n_keep; + truncated = true; + LOG_VERBOSE("input truncated", { + { "n_ctx", params.n_ctx }, + { "n_keep", params.n_keep }, + { "n_left", n_left }, + { "new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()) }, + }); + } + + while (n_past < embd.size()) { + int n_eval = (int)embd.size() - n_past; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) { + LOG_ERROR("failed to eval", { + { "n_eval", n_eval }, + { "n_past", n_past }, + { "n_threads", params.n_threads }, + { "embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()) }, + }); + has_next_token = false; + return result; + } + n_past += n_eval; + } + + // out of user input, sample next token + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? 
params.n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + llama_token id = 0; + + { + auto * logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + // Apply params.logit_bias map + for (const auto & it : params.logit_bias) { + logits[it.first] += it.second; + } + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); } else { - result = llama.doCompletion(); // inference next token + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, &candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } } - try { - json data = { - {"content", result }, - {"stop", !llama.has_next_token }}; - return res.set_content(data.dump(), "application/json"); - } catch (const json::exception &e) { - // Some tokens have bad UTF-8 strings, the json parser is very sensitive - json data = { - {"content", "" }, - {"stop", !llama.has_next_token }}; - return res.set_content(data.dump(), "application/json"); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + num_tokens_predicted++; + } + + // add it to the context + embd.push_back(id); + result = id; + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_token_eos()) { + //stopping_word = llama_token_to_str(ctx, embd.back()); + has_next_token = false; + stopped_eos = true; + LOG_VERBOSE("eos token found", {}); + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; + return result; + } + + size_t findStoppingStrings(const std::string & text, const size_t last_token_size, + const stop_type type) { + 
size_t stop_pos = std::string::npos; + for (const std::string & word : params.antiprompt) { + size_t pos; + if (type == STOP_FULL) { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); } - }); + else { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) { + if (type == STOP_FULL) { + stopping_word = word; + stopped_word = true; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; + } - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); + std::string doCompletion() { + const llama_token token = nextToken(); - if(params.embedding) { - fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); - } + const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token); + generated_text += token_text; - // change hostname and port - svr.listen(sparams.hostname, sparams.port); + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0 && !has_next_token) { + has_next_token = true; + n_remain++; + } + + if (!has_next_token && n_remain == 0) { + stopped_limit = true; + } + + LOG_VERBOSE("next token", { + { "token", token }, + { "token_text", llama_token_to_str(ctx, token) }, + { "has_next_token", has_next_token }, + { "n_remain", n_remain }, + { "num_tokens_predicted", num_tokens_predicted }, + { "stopped_eos", stopped_eos }, + { "stopped_word", stopped_word }, + { "stopped_limit", stopped_limit }, + { "stopping_word", stopping_word }, + }); + + return token_text; + } +}; + +static void server_print_usage(const char * argv0, const gpt_params & params, + const server_params & sparams) { + fprintf(stderr, "usage: %s [options]\n", argv0); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? 
"enabled" : "disabled"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); + if (llama_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + if (llama_mmap_supported()) { + fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); + fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); +#endif + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); + fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); + fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, "\n"); +} + +static void server_params_parse(int argc, char ** argv, server_params & sparams, + gpt_params & params) { + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } else if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "-h" || arg == "--help") { + server_print_usage(argv[0], default_params, default_sparams); + exit(0); + } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else 
if (arg == "--memory-f32" || arg == "--memory_f32") { + params.memory_f16 = false; + } else if (arg == "--threads" || arg == "-t") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. " + "See main README.md for information on enabling GPU BLAS support", { { "n_gpu_layers", params.n_gpu_layers } }); +#endif + } + else if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) { + if (i_device < split_arg.size()) { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } + else { + params.tensor_split[i_device] = 0.0f; + } + } +#else + LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {}); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--low-vram" || arg == "-lv") + { +#ifdef GGML_USE_CUBLAS + params.low_vram = true; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + LOG_WARNING("llama.cpp was compiled without cuBLAS. 
It is not possible to set a main GPU.", {}); +#endif + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter = argv[i]; + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-v" || arg == "--verbose") { +#if SERVER_VERBOSE != 1 + LOG_WARNING("server.cpp is not built with verbose logging.", {}); +#else + server_verbose = true; +#endif + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } + } + + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } +} + +static json format_generation_settings(llama_server_context & llama) { + const auto eos_bias = llama.params.logit_bias.find(llama_token_eos()); + const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + eos_bias->second < 0.0f && std::isinf(eos_bias->second); + + return json { + { "seed", llama.params.seed }, + { "temp", llama.params.temp }, + { "top_k", llama.params.top_k }, + { "top_p", llama.params.top_p }, + { "tfs_z", llama.params.tfs_z }, + { "typical_p", llama.params.typical_p }, + { "repeat_last_n", llama.params.repeat_last_n }, + { "repeat_penalty", llama.params.repeat_penalty }, + { "presence_penalty", llama.params.presence_penalty }, + { "frequency_penalty", llama.params.frequency_penalty }, + { "mirostat", llama.params.mirostat }, + { "mirostat_tau", llama.params.mirostat_tau }, + { "mirostat_eta", llama.params.mirostat_eta }, + { "penalize_nl", llama.params.penalize_nl }, + { "stop", llama.params.antiprompt }, + { "n_predict", llama.params.n_predict }, + { "n_keep", llama.params.n_keep }, + { "ignore_eos", ignore_eos }, + { "stream", llama.stream }, + { "logit_bias", llama.params.logit_bias }, + }; +} + +static json format_final_response(llama_server_context & llama, const std::string & content) { + return json { + { "content", content }, + { "stop", true }, + { "model", llama.params.model_alias }, + { "tokens_predicted", llama.num_tokens_predicted }, + { "generation_settings", format_generation_settings(llama) }, + { "prompt", llama.params.prompt }, + { "truncated", llama.truncated }, + { "stopped_eos", llama.stopped_eos }, + { "stopped_word", llama.stopped_word }, + { "stopped_limit", llama.stopped_limit }, + { "stopping_word", llama.stopping_word }, + }; +} + +static json format_partial_response(const std::string & content) { + return json { + { "content", content }, + { "stop", false }, + }; +} + +static json format_tokenizer_response(const std::vector & tokens) { + return json { + { "tokens", tokens } + }; +} + +static void parse_options_completion(const json & body, llama_server_context & llama) { + gpt_params default_params; + + llama.stream = body.value("stream", false); + llama.params.n_predict = body.value("n_predict", default_params.n_predict); + llama.params.top_k = body.value("top_k", default_params.top_k); + llama.params.top_p = body.value("top_p", default_params.top_p); + llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z); + llama.params.typical_p = body.value("typical_p", default_params.typical_p); + llama.params.repeat_last_n = body.value("repeat_last_n", 
default_params.repeat_last_n); + llama.params.temp = body.value("temperature", default_params.temp); + llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty); + llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty); + llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty); + llama.params.mirostat = body.value("mirostat", default_params.mirostat); + llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau); + llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta); + llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl); + llama.params.n_keep = body.value("n_keep", default_params.n_keep); + llama.params.seed = body.value("seed", default_params.seed); + llama.params.prompt = body.value("prompt", default_params.prompt); + + llama.params.logit_bias.clear(); + if (body.value("ignore_eos", false)) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } + + const auto & logit_bias = body.find("logit_bias"); + if (logit_bias != body.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(llama.ctx); + for (const auto & el : *logit_bias) { + if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + if (el[1].is_number()) { + llama.params.logit_bias[tok] = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + llama.params.logit_bias[tok] = -INFINITY; + } + } + } + } + } + + llama.params.antiprompt.clear(); + const auto & stop = body.find("stop"); + if (stop != body.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + llama.params.antiprompt.push_back(word); + } + } + } + + LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); +} + +static void log_server_request(const Request & req, const Response & res) { + LOG_INFO("request", { + { "remote_addr", req.remote_addr }, + { "remote_port", req.remote_port }, + { "status", res.status }, + { "path", req.path }, + { "request", req.body }, + { "response", res.body }, + }); +} + +int main(int argc, char ** argv) { + // own arguments required by this example + gpt_params params; + server_params sparams; + + // struct that contains llama context and inference + llama_server_context llama; + + server_params_parse(argc, argv, sparams, params); + + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + + llama_init_backend(); + + LOG_INFO("build info", { + { "build", BUILD_NUMBER }, + { "commit", BUILD_COMMIT } + }); + LOG_INFO("system info", { + { "n_threads", params.n_threads }, + { "total_threads", std::thread::hardware_concurrency() }, + { "system_info", llama_print_system_info() }, + }); + + // load the model + if (!llama.loadModel(params)) { + return 1; + } + + Server svr; + + svr.set_default_headers({ + { "Access-Control-Allow-Origin", "*" }, + { "Access-Control-Allow-Headers", "content-type" } + }); + + svr.Get("/", [](const Request &, Response & res) { + res.set_content("

<h1>llama.cpp server works</h1>

", "text/html"); + }); + + svr.Post("/completion", [&llama](const Request & req, Response & res) { + llama.rewind(); + llama_reset_timings(llama.ctx); + + parse_options_completion(json::parse(req.body), llama); + + llama.loadPrompt(); + llama.beginCompletion(); + + if (!llama.stream) { + size_t stop_pos = std::string::npos; + + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); + + stop_pos = llama.findStoppingStrings(llama.generated_text, + token_text.size(), STOP_FULL); + } + + if (stop_pos == std::string::npos) { + stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); + } + if (stop_pos != std::string::npos) { + llama.generated_text.erase(llama.generated_text.begin() + stop_pos, + llama.generated_text.end()); + } + + const json data = format_final_response(llama, llama.generated_text); + + llama_print_timings(llama.ctx); + + res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), + "application/json"); + } else { + const auto chunked_content_provider = [&](size_t, DataSink & sink) { + size_t sent_count = 0; + + while (llama.has_next_token) { + const std::string token_text = llama.doCompletion(); + if (llama.multibyte_pending > 0) { + continue; + } + + size_t pos = std::min(sent_count, llama.generated_text.size()); + + const std::string str_test = llama.generated_text.substr(pos); + size_t stop_pos = + llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); + if (stop_pos != std::string::npos) { + llama.generated_text.erase( + llama.generated_text.begin() + pos + stop_pos, + llama.generated_text.end()); + pos = std::min(sent_count, llama.generated_text.size()); + } else { + stop_pos = llama.findStoppingStrings(str_test, token_text.size(), + STOP_PARTIAL); + } + + const std::string to_send = llama.generated_text.substr(pos, stop_pos); + sent_count += to_send.size(); + + const json data = llama.has_next_token + ? format_partial_response(to_send) + // Generation is done, send extra information. + : format_final_response(llama, to_send); + + const std::string str = + "data: " + + data.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + + if (!sink.write(str.data(), str.size())) { + LOG_VERBOSE("stream closed", {}); + llama_print_timings(llama.ctx); + return false; + } + } + + llama_print_timings(llama.ctx); + sink.done(); + return true; + }; + res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + } + }); + + svr.Options(R"(/.*)", [](const Request &, Response & res) { + return res.set_content("", "application/json"); + }); + + svr.Post("/tokenize", [&llama](const Request & req, Response & res) { + const json body = json::parse(req.body); + const std::string content = body["content"].get(); + const std::vector tokens = llama_tokenize(llama.ctx, content, false); + const json data = format_tokenizer_response(tokens); + return res.set_content(data.dump(), "application/json"); + }); + + svr.set_logger(log_server_request); + + svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { + const auto * fmt = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try { + std::rethrow_exception(std::move(ep)); + } catch (std::exception & e) { + snprintf(buf, sizeof(buf), fmt, e.what()); + } catch (...) 
{ + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain"); + res.status = 500; + }); + + // set timeouts and change hostname and port + svr.set_read_timeout(sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + LOG_ERROR("couldn't bind to server socket", { + { "hostname", sparams.hostname }, + { "port", sparams.port }, + }); + return 1; + } + + LOG_INFO("HTTP server listening", { + { "hostname", sparams.hostname }, + { "port", sparams.port }, + }); + + if (!svr.listen_after_bind()) { + return 1; + } + + return 0; } diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt new file mode 100644 index 000000000..1568f7364 --- /dev/null +++ b/examples/simple/CMakeLists.txt @@ -0,0 +1,7 @@ +set(TARGET simple) +add_executable(${TARGET} simple.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp new file mode 100644 index 000000000..76f991cdc --- /dev/null +++ b/examples/simple/simple.cpp @@ -0,0 +1,177 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "common.h" +#include "llama.h" +#include "build-info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#endif + + + +int main(int argc, char ** argv) +{ + gpt_params params; + + //--------------------------------- + // Print help : + //--------------------------------- + + if ( argc == 1 || argv[1][0] == '-' ) + { + printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] ); + return 1 ; + } + + //--------------------------------- + // Load parameters : + //--------------------------------- + + if ( argc >= 2 ) + { + params.model = argv[1]; + } + + if ( argc >= 3 ) + { + params.prompt = argv[2]; + } + + if ( params.prompt.empty() ) + { + params.prompt = "Hello my name is"; + } + + //--------------------------------- + // Init LLM : + //--------------------------------- + + llama_init_backend(); + + llama_context * ctx ; + + ctx = llama_init_from_gpt_params( params ); + + if ( ctx == NULL ) + { + fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); + return 1; + } + + //--------------------------------- + // Tokenize the prompt : + //--------------------------------- + + std::vector tokens_list; + tokens_list = ::llama_tokenize( ctx , params.prompt , true ); + + const int max_context_size = llama_n_ctx( ctx ); + const int max_tokens_list_size = max_context_size - 4 ; + + if ( (int)tokens_list.size() > max_tokens_list_size ) + { + fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" , + __func__ , (int)tokens_list.size() , max_tokens_list_size ); + return 1; + } + + fprintf( stderr, "\n\n" ); + + // Print the tokens from the prompt : + + for( auto id : tokens_list ) + { + printf( "%s" , llama_token_to_str( ctx , id ) ); + } + + fflush(stdout); + + + //--------------------------------- + // Main prediction loop : + //--------------------------------- + + // The LLM keeps a contextual cache memory of previous token evaluation. 
+ // Usually, once this cache is full, it is required to recompute a compressed context based on previous + // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist + // example, we will just stop the loop once this cache is full or once an end of stream is detected. + + while ( llama_get_kv_cache_token_count( ctx ) < max_context_size ) + { + //--------------------------------- + // Evaluate the tokens : + //--------------------------------- + + if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) ) + { + fprintf( stderr, "%s : failed to eval\n" , __func__ ); + return 1; + } + + tokens_list.clear(); + + //--------------------------------- + // Select the best prediction : + //--------------------------------- + + llama_token new_token_id = 0; + + auto logits = llama_get_logits( ctx ); + auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens) + + std::vector candidates; + candidates.reserve( n_vocab ); + + for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ ) + { + candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } ); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Select it using the "Greedy sampling" method : + new_token_id = llama_sample_token_greedy( ctx , &candidates_p ); + + + // is it an end of stream ? + if ( new_token_id == llama_token_eos() ) + { + fprintf(stderr, " [end of text]\n"); + break; + } + + // Print the new token : + printf( "%s" , llama_token_to_str( ctx , new_token_id ) ); + fflush( stdout ); + + // Push this new token for next evaluation : + tokens_list.push_back( new_token_id ); + + } // wend of main loop + + llama_free( ctx ); + + return 0; +} + +// EOF diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 5344d1f52..726ec47c0 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -4,7 +4,7 @@ Basic usage instructions: ```bash # get training data -wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt +wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt # train ./bin/train-text-from-scratch \ diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 51271b497..7ec85951a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -12,6 +12,9 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif struct random_normal_distribution { std::mt19937 gen; @@ -20,7 +23,6 @@ struct random_normal_distribution { float max; }; - struct random_uniform_distribution { std::mt19937 gen; std::uniform_real_distribution rd; @@ -2366,7 +2368,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_u32(0); file->write_u32(0); file->write_u32(GGML_TYPE_F32); - file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); return; } const char * name = ggml_get_name(tensor); @@ -2381,7 +2383,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_u32(tensor->type); file->write_raw(ne, sizeof(ne[0]) * nd); file->write_raw(name, name_len); - 
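The `write_tensor`/`read_tensor` hunks in the train-text-from-scratch diff change `-file->tell() & 31` to `0-file->tell() & 31`. Both expressions compute the padding that advances the file position to the next 32-byte boundary (`&` binds more loosely than the subtraction); the `0-` spelling presumably only sidesteps a compiler warning about negating an unsigned offset, in the same spirit as the MSVC pragmas added to this file. A small check of that arithmetic, assuming `tell()` returns a non-negative offset:

```cpp
// Check that (0 - tell) & 31 is the padding to the next 32-byte boundary.
#include <cstdio>
#include <cstddef>

int main() {
    const size_t offsets[] = { 0, 1, 31, 32, 33, 100 };
    for (const size_t tell : offsets) {
        const size_t pad = (0 - tell) & 31; // same value the patched code computes
        printf("tell=%3zu pad=%2zu -> aligned position %zu\n", tell, pad, tell + pad);
    }
    return 0;
}
```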
file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); file->write_raw(tensor->data, ggml_nbytes(tensor)); } @@ -2402,7 +2404,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { std::string name = file->read_string(name_len); GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); } @@ -2756,8 +2758,8 @@ struct train_params get_default_train_params() { params.lbfgs_n_iter = 16; params.adam_n_iter = 16; - params.adam_alpha = 1e-3; - params.adam_decay = 1e-3; + params.adam_alpha = 1e-3f; + params.adam_decay = 1e-3f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -3331,8 +3333,8 @@ int main(int argc, char ** argv) { int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; - sampler.params.temp = 0.2; - sampler.params.repeat_penalty = 1.1; + sampler.params.temp = 0.2f; + sampler.params.repeat_penalty = 1.1f; sampler.params.mirostat = 2; init_sampler(&sampler, lctx); diff --git a/flake.nix b/flake.nix index f3180c841..bba3d71f7 100644 --- a/flake.nix +++ b/flake.nix @@ -48,6 +48,19 @@ ''; meta.mainProgram = "llama"; }; + apps.llama-server = { + type = "app"; + program = "${self.packages.${system}.default}/bin/llama-server"; + }; + apps.llama-embedding = { + type = "app"; + program = "${self.packages.${system}.default}/bin/embedding"; + }; + apps.llama = { + type = "app"; + program = "${self.packages.${system}.default}/bin/llama"; + }; + apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { packages = with pkgs; [ cmake diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bd89d0a1f..16488b9f9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,6 +13,10 @@ #include "ggml-cuda.h" #include "ggml.h" +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); #define CUDA_CHECK(err) \ @@ -167,6 +171,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define GGML_CUDA_DMMV_Y 1 #endif +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -326,37 +336,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q2_K * x = (const block_q2_K *) vx; - - // if n is 0, we want to do the lower 128, else the upper 128, - // covering y[l+0], y[l+32], y[l+64], y[l+96] and - // y[l+16], y[l+48], y[l+80], y[l+112] - int n = iqs/128; // 0 or 1 - int r = iqs - 128*n; // 0...120 in steps of 8 - int l = r/8; // 0...15 in steps of 1 - - const float * y = yy + 128*n + l; - const uint8_t * q = x[ib].qs + 32*n + l; - const uint8_t * s = x[ib].scales + 8*n; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4)) - + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4)) - + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4)) - + y[ 96] * 
(dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4)) - + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) - + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) - + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) - + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); - - result = sum; - -} - static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { int r = threadIdx.x/4; @@ -388,51 +367,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q3_K * x = (const block_q3_K *) vx; - - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; - - uint32_t aux[3]; - uint32_t utmp[4]; - - // if n is 0, we want to do the lower 128, else the upper 128, - // covering y[l+0], y[l+32], y[l+64], y[l+96] and - // y[l+16], y[l+48], y[l+80], y[l+112] - int n = iqs/128; // 0 or 1 - int r = iqs - 128*n; // 0...120 in steps of 8 - int l = r/8; // 0...15 in steps of 1 - - const float * y = yy + 128*n + l; - const uint8_t * q = x[ib].qs + 32*n + l; - const uint8_t * hm = x[ib].hmask + l; - const int8_t * s = (const int8_t *)utmp + 8*n; - - memcpy(aux, x[ib].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - const float dall = x[ib].d; - - const uint8_t m = 1 << (4*n); - - float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4)) - + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4)) - + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4)) - + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4)) - + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4)) - + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4)) - + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4)) - + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 
0 : 4)); - - result = sum * dall; - -} - static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -479,38 +413,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { } } -static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q4_K * x = (const block_q4_K *) vx; - - // iqs is in 0...248 in steps of 8 => - const int j = iqs / 64; // j is in 0...3 - const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 - const int is = 2*j; // is is in 0...6 in steps of 2 - - const float * y = yy + 64*j + ir; - const uint8_t * q = x[ib].qs + 32*j + ir; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, sc, m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, sc, m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1); - sum += y[k + 32] * (d2 * (q[k] >> 4) - m2); - } - result = sum; - -} - static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { const block_q5_K * x = (const block_q5_K *) vx; @@ -544,43 +446,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; } -static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q5_K * x = (const block_q5_K *) vx; - - // iqs is in 0...248 in steps of 8 => - const int j = iqs / 64; // j is in 0...3 - const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 - const int is = 2*j; // is is in 0...6 in steps of 2 - - const float * y = yy + 64*j + ir; - const uint8_t * ql = x[ib].qs + 32*j + ir; - const uint8_t * qh = x[ib].qh + ir; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, sc, m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, sc, m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - uint8_t hm = 1 << is; - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1); - } - hm <<= 1; - for (int k = 0; k < 4; ++k) { - sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 
16 : 0)) - m2); - } - result = sum; - -} - static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { const block_q6_K * x = (const block_q6_K *) vx; @@ -606,31 +471,376 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } -static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { - const block_q6_K * x = (const block_q6_K *) vx; + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int ip = iqs / 128; // 0 or 1 - const int il = (iqs - 128*ip)/8; // 0...15 - const int is = 8*ip; + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; - const float * y = yy + 128*ip + il; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; - const float d = x[ib].d; + const block_q2_K * x = (const block_q2_K *)vx + ib0; - const uint8_t * ql = x[ib].ql + 64*ip + il; - const uint8_t * qh = x[ib].qh + 32*ip + il; - const int8_t * sc = x[ib].scales + is; + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 - result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) - + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32) - + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32) - + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32) - + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32) - + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32) - + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32) - + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32); + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
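The index bookkeeping that opens `dequantize_mul_mat_vec_q2_k` (and continues in the lines just below) maps each lane of a 32-wide warp onto a slice of a 256-value super-block: `ix` picks which blocks the lane iterates over, `im` selects the lower or upper 128 values, and `l0` is the offset inside that half. A host-side walk-through of those expressions, assuming the default `K_QUANTS_PER_ITERATION == 2` and an arbitrarily chosen lane:

```cpp
// Host-side evaluation of the thread-to-data mapping used by the new
// dequantize_mul_mat_vec_q2_k kernel (K_QUANTS_PER_ITERATION assumed to be 2).
#include <cstdio>

int main() {
    const int K_QUANTS_PER_ITERATION = 2;
    const int threadIdx_x = 13; // arbitrary lane of the warp

    const int tid  = threadIdx_x / K_QUANTS_PER_ITERATION; // 6
    const int ix   = threadIdx_x % K_QUANTS_PER_ITERATION; // 1 -> starts at block i = 1, strides by 2
    const int step = 16 / K_QUANTS_PER_ITERATION;          // 8
    const int im   = tid / step;                           // 0 -> lower 128 values of the super-block
    const int in   = tid - step * im;                      // 6
    const int l0   = K_QUANTS_PER_ITERATION * in;          // 12

    printf("q_offset=%d s_offset=%d y_offset=%d\n",
           32 * im + l0, 8 * im, 128 * im + l0);            // 12, 0, 12
    return 0;
}
```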
+ const int in = tid - step*im; // 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; // 0, 1 + + const int n = 2; // iterations in the inner loop + const int im = tid/8; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - 8*im; // 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...28 in steps of 4 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 
0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 4; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * q1 = x[i].qs + q_offset; + const uint8_t * q2 = q1 + 64; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + //const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 4; + + const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * ql2 = ql1 + 64; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0)); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
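The index arithmetic around this point maps each lane of the warp onto offsets within one 256-value q6_K super-block, and it is easiest to audit off-device. The following standalone C++ sketch is not part of the patch; it simply replays the same formulas for both supported values of K_QUANTS_PER_ITERATION and prints each lane's offsets into ql, qh, scales and y:

```cpp
// Host-side check of the q6_K thread mapping used in this kernel (not part of the patch).
// For each of the 32 lanes in a warp it prints ix (which subset of super-blocks the lane
// loops over), im (lower or upper 128 values), in, and the resulting data offsets.
#include <cstdio>

int main() {
    for (int kqpi : {1, 2}) {                  // K_QUANTS_PER_ITERATION
        const int step = 16 / kqpi;            // 16 or 8
        std::printf("K_QUANTS_PER_ITERATION = %d\n", kqpi);
        for (int lane = 0; lane < 32; ++lane) {
            const int tid = lane / kqpi;       // 0...31 or 0...15
            const int ix  = lane % kqpi;       // 0      or 0, 1
            const int im  = tid / step;        // 0 -> values 0..127, 1 -> values 128..255
            const int in  = tid - step * im;   // 0...15 or 0...7
            const int l0  = (kqpi == 1) ? in : 4 * in;
            const int is  = (kqpi == 1) ? 0  : in / 4;
            std::printf("lane %2d: ix=%d im=%d in=%2d ql=%3d qh=%3d s=%d y=%3d\n",
                        lane, ix, im, in, 64*im + l0, 32*im + l0, 8*im + is, 128*im + l0);
        }
    }
    return 0;
}
```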
+ const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } } static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){ @@ -712,46 +922,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, } } -template -static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) { - const int row = blockIdx.y*blockDim.y + threadIdx.y; - - if (row >= nrows) { - return; - } - - const int tid = threadIdx.x; - - const int iter_stride = QK_K; - const int vals_per_iter = iter_stride / n_thread; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - float tmp = 0; // partial sum for thread in warp - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = ib0 + col/QK_K; // x block index - const int iqs = col%QK_K; // x quant index - const int iybs = col - col%QK_K; // y block start index - - float v; - dot_kernel(vx, ib, iqs, y + iybs, v); - tmp += v; - } - - // sum up partial sums and write back result - __syncthreads(); -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (half *) vx; @@ -1094,43 +1264,34 @@ 
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; + const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -1306,19 +1467,13 @@ static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default static size_t g_scratch_offset = 0; -#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication. 
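One small note on the new q6_K launcher above: its grid shape falls out of K_QUANTS_PER_ITERATION, since each block of 32 x ny threads covers ny matrix rows. A host-side C++ sketch (not from the patch; nrows is an arbitrary example value) reproduces the arithmetic:

```cpp
// Not part of the patch: reproduces the launch-geometry arithmetic used by
// dequantize_mul_mat_vec_q6_K_cuda. Each block is 32 x ny threads and handles
// ny rows, so the y-dimension of the grid is a ceiling division over nrows.
#include <cstdio>

int main() {
    const int nrows = 4096;                           // example row count (hypothetical)
    for (int kqpi : {1, 2}) {                         // K_QUANTS_PER_ITERATION
        const int ny          = 2 / kqpi;             // rows handled per block: 2 or 1
        const int block_num_y = (nrows + ny - 1)/ny;  // ceil(nrows / ny)
        std::printf("K_QUANTS_PER_ITERATION=%d -> block_dims=(32,%d,1), block_nums=(1,%d,1), threads/block=%d\n",
                    kqpi, ny, block_num_y, 32 * ny);
    }
    return 0;
}
```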
-#define GGML_CUDA_MAX_EVENTS 64 - static int g_device_count = -1; static int g_main_device = 0; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; - -static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; -static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr }; +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; void ggml_init_cublas() { static bool initialized = false; @@ -1342,15 +1497,8 @@ void ggml_init_cublas() { for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(cudaSetDevice(id)); - // create streams - for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking)); - } - // create events - for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming)); - } + // create main stream + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking)); // create cublas handle CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); @@ -1817,6 +1965,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + // if multiple GPUs are used they need to wait for the main GPU to finish + if (split && g_device_count > 1) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + for (int id = 0; id < g_device_count; ++id) { if (!split && id != g_main_device) { continue; @@ -1915,9 +2069,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } const int64_t i11 = i13*ne12 + i12; - cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS]; - cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS]; - cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS]; + cudaStream_t cudaStream_main = g_cudaStreams_main[id]; // for split tensors the data begins at i0 == i0_offset_low char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; @@ -1945,14 +2097,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm if (src1->backend == GGML_BACKEND_CPU) { GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); int64_t nrows1 = flatten_rows ? 
nrows0 : ne11; - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main)); } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { if (id != g_main_device) { GGML_ASSERT(!flatten_rows); float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; src1_ddf_i_source += i11*src1_stride; CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), - cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1)); + cudaMemcpyDeviceToDevice, cudaStream_main)); } } else if (src1_on_device && !src1_is_contiguous) { GGML_ASSERT(!split); @@ -1961,7 +2113,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm GGML_ASSERT(false); } } - CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1)); if (!src0_on_device || !src0_is_contiguous) { if (src0_is_f32) { @@ -1977,9 +2128,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm CUDA_CHECK(cudaGetLastError()); } - // wait with main stream until src1 memcpy is done - CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0)); - // do the computation op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); @@ -2017,8 +2165,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // wait until each device is finished, then free their buffers for (int id = 0; id < g_device_count; ++id) { + if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) { + continue; + } + CUDA_CHECK(cudaSetDevice(id)); CUDA_CHECK(cudaDeviceSynchronize()); + if (src0_asq[id] > 0) { ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); } @@ -2084,7 +2237,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne02 = src0->ne[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -2096,8 +2249,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); - - CUDA_CHECK(cudaDeviceSynchronize()); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -2115,7 +2266,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t nb02 = src0->nb[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -2130,8 +2281,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int channel_stride_x = nb02 / sizeof(half); ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); - - CUDA_CHECK(cudaDeviceSynchronize()); } void ggml_cuda_mul_mat(const ggml_tensor * 
src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2187,7 +2336,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t nb12 = src1->nb[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; @@ -2205,8 +2354,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens GGML_ASSERT(false); } - CUDA_CHECK(cudaDeviceSynchronize()); - (void) dst; } diff --git a/ggml-metal.m b/ggml-metal.m index 7fa5393c9..e476fba2b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -57,6 +57,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_q5_k); GGML_METAL_DECL_KERNEL(get_rows_q6_k); GGML_METAL_DECL_KERNEL(rms_norm); + GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); @@ -66,8 +67,10 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32); GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); + GGML_METAL_DECL_KERNEL(cpy_f16_f16); #undef GGML_METAL_DECL_KERNEL }; @@ -162,6 +165,7 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(get_rows_q5_k); GGML_METAL_ADD_KERNEL(get_rows_q6_k); GGML_METAL_ADD_KERNEL(rms_norm); + GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); @@ -171,8 +175,10 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32); GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f16_f16); #undef GGML_METAL_ADD_KERNEL } @@ -225,7 +231,8 @@ bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, - size_t size) { + size_t size, + size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); return false; @@ -266,13 +273,8 @@ bool ggml_metal_add_buffer( ++ctx->n_buffers; } else { - // Example, say you want to map 16GB buffer. Create 3 views, each 8GB of size: - // - // view 0 has offset 0, i.e. range [0GB, 8GB] - // view 1 has offset 4GB, i.e range [4GB, 8GB] - // view 2 has offset 8GB, i.e. 
range [8GB, 16GB] - // - const size_t size_step = ctx->device.maxBufferLength/2; + const size_t size_ovlp = (max_size + size_page - 1) / size_page * size_page; + const size_t size_step = ctx->device.maxBufferLength - size_ovlp; const size_t size_view = ctx->device.maxBufferLength; for (size_t i = 0; i < size; i += size_step) { @@ -787,6 +789,70 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_NORM: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + const float eps = 1e-5f; + + const int nth = 256; + + [encoder setComputePipelineState:ctx->pipeline_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ALIBI: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + GGML_ASSERT((src0t == GGML_TYPE_F32)); + + const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) src1->data)[1]; + const float max_bias = ((float *) src1->data)[2]; + + if (__builtin_popcount(n_head) != 1) { + GGML_ASSERT(false && "only power-of-two n_head implemented"); + } + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + + [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + const int nth = 32; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_ROPE: { if (encoder == nil) { @@ -840,6 +906,14 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false && "not implemented"); }; } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; + case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; + default: 
GGML_ASSERT(false && "not implemented"); + }; + } break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index 09e12a879..d1e49222d 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -256,6 +256,72 @@ kernel void kernel_get_rows_q4_1( (device float *) ((device char *) dst + i*nb1), ne00); } +kernel void kernel_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + // MEAN + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += x[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + // broadcast + if (tpitg == 0) { + sum[0] /= ne00; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float mean = sum[0]; + + // recenter + device float * y = dst + tgpig*ne00; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = x[i00] - mean; + } + + // VARIANCE + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += y[i00] * y[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + // broadcast + if (tpitg == 0) { + sum[0] /= ne00; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float variance = sum[0]; + + const float scale = 1.0f/sqrt(variance + eps); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = y[i00] * scale; + } +} + + kernel void kernel_rms_norm( device const void * src0, device float * dst, @@ -485,6 +551,48 @@ kernel void kernel_mul_mat_f16_f32( } } +kernel void kernel_alibi_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant float & m0, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float m_k = pow(m0, i2 + 1); + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = 
src[0] + m_k * (i00 - ne00 + 1); + } +} + kernel void kernel_rope( device const void * src0, device float * dst, @@ -540,6 +648,47 @@ kernel void kernel_rope( } } +kernel void kernel_cpy_f16_f16( + device const half * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + kernel void kernel_cpy_f32_f16( device const float * src0, device half * dst, diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 5df922abd..95f4cec6d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -15,7 +15,11 @@ #include "ggml.h" -#define CL_DMMV_BLOCK_SIZE 32; +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define CL_DMMV_BLOCK_SIZE 32 #define MULTILINE_QUOTE(...) 
#__VA_ARGS__ static std::string program_source = MULTILINE_QUOTE( @@ -59,6 +63,46 @@ struct __attribute__ ((packed)) block_q8_0 int8_t qs[QK8_0]; }; +struct __attribute__((packed)) block_q2_K +{ + uint8_t scales[16]; + uint8_t qs[64]; + half d; + half dmin; +}; + +struct __attribute__((packed)) block_q3_K +{ + uint8_t hmask[32]; + uint8_t qs[64]; + uint8_t scales[12]; + half d; +}; + +struct __attribute__((packed)) block_q4_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q5_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qh[32]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q6_K +{ + uint8_t ql[128]; + uint8_t qh[64]; + int8_t scales[16]; + half d; +}; __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { const uint i = get_global_id(0); @@ -131,8 +175,314 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } + +inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) +{ + if (j < 4) + { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } + else + { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int n = tid / 32; + const int l = tid - 32 * n; + const int is = 8 * n + l / 16; + + const uint8_t q = x[i].qs[32 * n + l]; + __global float *y = yy + i * 256 + 128 * n; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); + y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); + y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); + y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); +} + +__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) +{ + int r = get_local_id(0) / 4; + int i = get_group_id(0); + int tid = r / 2; + int is0 = r % 2; + int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); + int n = tid / 4; + int j = tid - 4 * n; + + uint8_t m = 1 << (4 * n + j); + int is = 8 * n + 2 * j + is0; + int shift = 2 * j; + + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) + : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) + : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) + : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + float d_all = vload_half(0, &x[i].d); + float dl = d_all * (us - 32); + + __global float *y = yy + i * 256 + 128 * n + 32 * j; + const __global uint8_t *q = x[i].qs + 32 * n; + const __global uint8_t *hm = x[i].hmask; + + for (int l = l0; l < l0 + 4; ++l) + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); +} + +__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 8; + const int ir = tid % 8; + const int is = 2 * il; + const int n = 4; + + __global float *y = yy + i * 256 + 64 * il + n * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *q = x[i].qs + 32 * il + n * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + float d1 = dall * sc; + float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + float d2 = dall * sc; + float m2 = dmin * m; + for (int l = 0; l < n; ++l) + { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l + 32] = d2 * (q[l] >> 4) - m2; + } +} + +__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 16; + const int ir = tid % 16; + const int is = 2 * il; + + __global float *y = yy + i * 256 + 64 * il + 2 * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; + __global const uint8_t *qh = x[i].qh + 2 * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << (2 * il); + y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; + y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 
16 : 0)) - m2; +} + +__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int ip = tid / 32; + const int il = tid - 32 * ip; + const int is = 8 * ip + il / 16; + + __global float *y = yy + i * 256 + 128 * ip + il; + + const float d = vload_half(0, &x[i].d); + + __global const uint8_t *ql = x[i].ql + 64 * ip + il; + const uint8_t qh = x[i].qh[32 * ip + il]; + __global const int8_t *sc = x[i].scales + is; + + y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + + +void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + int n = iqs / 128; + int r = iqs - 128 * n; + int l = r / 8; + + __global const float *y = yy + 128 * n + l; + __global const uint8_t *q = x[ib].qs + 32 * n + l; + __global const uint8_t *s = x[ib].scales + 8 * n; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4)) + + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4)) + + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4)) + + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4)) + + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) + + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) + + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) + + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); + + *result = sum; +} + +void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + uint32_t aux[3]; + uint32_t utmp[4]; + + int n = iqs/128; + int r = iqs - 128*n; + int l = r/8; + + __global const float * y = yy + 128*n + l; + __global const uint8_t * q = x[ib].qs + 32*n + l; + __global const uint8_t * hm = x[ib].hmask + l; + const int8_t * s = (const int8_t *)utmp + 8*n; + + aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24; + aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24; + aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24; + + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + const float dall = vload_half(0, &x[ib].d); + const uint8_t m = 1 << (4*n); + + float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4)) + + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4)) + + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4)) + + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4)) + + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 
0 : 4)) + + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4)) + + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4)) + + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4)); + + *result = sum * dall; + +} + +void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const int j = iqs / 64; // j is in 0...3 + const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 + const int is = 2*j; // is is in 0...6 in steps of 2 + + __global const float * y = yy + 64*j + ir; + __global const uint8_t * q = x[ib].qs + 32*j + ir; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + float sum = 0; + for (int k = 0; k < 4; ++k) { + sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1); + sum += y[k + 32] * (d2 * (q[k] >> 4) - m2); + } + + *result = sum; +} + +void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const int j = iqs / 64; + const int ir = (iqs - 64*j)/2; + const int is = 2*j; + + __global const float * y = yy + 64*j + ir; + __global const uint8_t * ql = x[ib].qs + 32*j + ir; + __global const uint8_t * qh = x[ib].qh + ir; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << is; + float sum = 0; + for (int k = 0; k < 4; ++k) { + sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1); + } + hm <<= 1; + for (int k = 0; k < 4; ++k) { + sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 
16 : 0)) - m2); + } + *result = sum; + +} + +void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + + const int ip = iqs / 128; // 0 or 1 + const int il = (iqs - 128*ip)/8; // 0...15 + const int is = 8*ip; + + __global const float * y = yy + 128*ip + il; + + const float d = vload_half(0, &x[ib].d); + + __global const uint8_t * ql = x[ib].ql + 64*ip + il; + __global const uint8_t * qh = x[ib].qh + 32*ip + il; + __global const int8_t * sc = x[ib].scales + is; + + *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) + + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32) + + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32) + + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32) + + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32) + + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32) + + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32) + + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32); + +} + ); + std::string dequant_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; @@ -160,7 +510,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { const int block_size = get_local_size(0); - const int row = get_global_id(0) / block_size; + const int row = get_group_id(0); const int tid = get_local_id(0); const uint qk = QUANT_K; @@ -199,6 +549,45 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float } ); +std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_group_id(0); + const int tid = get_local_id(0); + + const int iter_stride = 256; + const int vals_per_iter = iter_stride / block_size; + const int num_blocks_per_row = ncols / 256; + const int ib0 = row*num_blocks_per_row; + + tmp[tid] = 0; + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = ib0 + col/256; // x block index + const int iqs = col%256; // x quant index + const int iybs = col - col%256; // y block start index + + // dequantize + float v; + DOT_KERNEL(x, ib, iqs, y + iybs, &v); + tmp[tid] += v; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + std::string mul_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); @@ -260,6 +649,18 @@ std::array mul_str_values = { "mul_f32", "float" }; +std::array dmmv_k_str_keys = { + "KERNEL_NAME", "X_TYPE", "DOT_KERNEL" +}; + +std::array dmmv_k_str_values = { + 
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K", + "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K", + "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K", + "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K", + "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K", +}; + std::string& replace(std::string& s, const std::string& from, const std::string& to) { size_t pos = 0; while ((pos = s.find(from, pos)) != std::string::npos) { @@ -289,6 +690,14 @@ std::string generate_kernels() { } src << mul_kernel << '\n'; } + for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) { + std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template; + for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) { + replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]); + } + src << dmmv_k_kernel << '\n'; + } + return src.str(); } @@ -300,6 +709,8 @@ static cl_program program; static cl_kernel convert_row_f16_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; static cl_kernel mul_f32_cl; static bool fp16_support; @@ -529,6 +940,12 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); + CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); + CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); + CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); + CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); // dequant mul mat kernel CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); @@ -537,6 +954,11 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); + 
CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); // mul kernel CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); @@ -554,6 +976,16 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { return &dequantize_row_q5_1_cl; case GGML_TYPE_Q8_0: return &dequantize_row_q8_0_cl; + case GGML_TYPE_Q2_K: + return &dequantize_block_q2_k_cl; + case GGML_TYPE_Q3_K: + return &dequantize_block_q3_k_cl; + case GGML_TYPE_Q4_K: + return &dequantize_block_q4_k_cl; + case GGML_TYPE_Q5_K: + return &dequantize_block_q5_k_cl; + case GGML_TYPE_Q6_K: + return &dequantize_block_q6_k_cl; case GGML_TYPE_F16: return &convert_row_f16_cl; default: @@ -561,6 +993,50 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { } } +static size_t ggml_cl_global_denom(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return 4; + case GGML_TYPE_Q4_K: + return 8; + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return 4; + case GGML_TYPE_F16: + default: + return 1; + } +} + +static size_t ggml_cl_local_size(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 0; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return 64; + case GGML_TYPE_Q4_K: + return 32; + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return 64; + case GGML_TYPE_F16: + default: + return 0; + } +} + static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -575,6 +1051,16 @@ static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { return &dequantize_mul_mat_vec_q8_0_cl; case GGML_TYPE_F16: return &convert_mul_mat_vec_f16_cl; + case GGML_TYPE_Q2_K: + return &dequantize_mul_mat_vec_q2_K_cl; + case GGML_TYPE_Q3_K: + return &dequantize_mul_mat_vec_q3_K_cl; + case GGML_TYPE_Q4_K: + return &dequantize_mul_mat_vec_q4_K_cl; + case GGML_TYPE_Q5_K: + return &dequantize_mul_mat_vec_q5_K_cl; + case GGML_TYPE_Q6_K: + return &dequantize_mul_mat_vec_q6_K_cl; default: return nullptr; } @@ -1017,6 +1503,9 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); GGML_ASSERT(to_fp32_cl != nullptr); + const size_t global_denom = ggml_cl_global_denom(type); + const size_t local = ggml_cl_local_size(type); + size_t ev_idx = 0; std::vector events; @@ -1049,10 +1538,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); } else { // general dequantization kernel + CLBlast matrix matrix multiplication // convert src0 to fp32 on device - const size_t global = x_ne; + const size_t global = x_ne / global_denom; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? 
events.data() : NULL, NULL)); // copy src1 to device CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); diff --git a/ggml.c b/ggml.c index c0efa1977..0eda7f338 100644 --- a/ggml.c +++ b/ggml.c @@ -35,6 +35,12 @@ #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) +#endif + #if defined(_WIN32) #include diff --git a/llama.cpp b/llama.cpp index b8bc0d821..a2916b3e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -40,6 +40,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 @@ -882,6 +886,7 @@ static bool kv_cache_init( const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.n = 0; struct ggml_init_params params; params.mem_size = cache.buf.size; @@ -900,6 +905,7 @@ static bool kv_cache_init( ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.v, "cache_v"); + (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer + 1) { ggml_cuda_assign_buffers_no_scratch(cache.v); @@ -1249,7 +1255,7 @@ static void llama_model_load_internal( vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", + fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n", __func__, vram_scratch / MB); } } @@ -1654,7 +1660,7 @@ static bool llama_eval_internal( // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.norm); - offload_func_nr(cur); + // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); embeddings = cur; diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 26bf50c9a..7b18090d6 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -10,6 +10,10 @@ #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + constexpr int kVecSize = 1 << 18; float drawFromGaussianPdf(std::mt19937& rndm) { diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index 2ce572826..d12748281 100644 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,9 +1,10 @@ import os import hashlib + def sha256sum(file): block_size = 16 * 1024 * 1024 # 16 MB block size - b = bytearray(block_size) + b = bytearray(block_size) file_hash = hashlib.sha256() mv = memoryview(b) with open(file, 'rb', buffering=0) as f: @@ -15,6 +16,7 @@ def sha256sum(file): return file_hash.hexdigest() + # Define the path to the llama directory (parent folder of script directory) llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 728460b5e..c40f1b29c 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -9,12 +9,15 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif -const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001; -const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002; -const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075; -const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040; -const float MAX_DOT_PRODUCT_ERROR = 0.02; +const float MAX_QUANTIZATION_REFERENCE_ERROR = 
0.0001f; +const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; +const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +const float MAX_DOT_PRODUCT_ERROR = 0.02f; const char* RESULT_STR[] = {"ok", "FAILED"}; diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index d5514455d..600375771 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -13,6 +13,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define MAX_ALIGNMENT 64 #define QK 32 #define WARMUP 5 diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 0e675127f..5d693f7b5 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -176,27 +176,27 @@ void test_frequency_presence_penalty( int main(void) { ggml_time_init(); - test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1); - test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); - test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5); - test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5); + test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); + test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); printf("OK\n"); } diff --git a/tests/test-tokenizer-0.cpp 
b/tests/test-tokenizer-0.cpp
index b08984571..ab1538a0c 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -53,7 +53,7 @@ int main(int argc, char **argv) {
     for (const auto & test_kv : k_tests()) {
         std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
         res.resize(n);
         bool correct = res.size() == test_kv.second.size();
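All of the dequantize_mul_mat_vec_*_k kernels introduced in this patch finish with the same __shfl_xor_sync butterfly before thread 0 writes dst[row]. The following plain C++ model (a sketch for illustration, not code from the patch) mimics that exchange on the CPU and checks that every lane ends up holding the full warp sum, which is why a single write from tid == 0 is enough:

```cpp
// CPU model of the warp reduction used by the new k-quant kernels. Each "lane"
// holds a partial sum; XOR-ing the lane index with mask = 16, 8, 4, 2, 1 pairs
// lanes up in a butterfly, so after the loop every lane holds the sum of all 32
// partials (on the GPU, __shfl_xor_sync performs the exchange within the warp).
#include <cassert>
#include <cstdio>

int main() {
    float lane[32];
    float total = 0.0f;
    for (int i = 0; i < 32; ++i) {
        lane[i] = 0.5f * i;                  // arbitrary partial sums
        total  += lane[i];
    }

    for (int mask = 16; mask > 0; mask >>= 1) {
        float exchanged[32];
        for (int i = 0; i < 32; ++i) {
            exchanged[i] = lane[i ^ mask];   // value __shfl_xor_sync would return
        }
        for (int i = 0; i < 32; ++i) {
            lane[i] += exchanged[i];
        }
    }

    for (int i = 0; i < 32; ++i) {
        assert(lane[i] == total);            // exact here: all partials are multiples of 0.5
    }
    std::printf("all 32 lanes hold the full sum: %f\n", lane[0]);
    return 0;
}
```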