From 2edbdb0f99336cb41f0995061c7602ed54beb863 Mon Sep 17 00:00:00 2001
From: 44670 <44670@users.noreply.github.com>
Date: Thu, 4 May 2023 23:41:12 +0800
Subject: [PATCH 01/10] main : add --in-suffix option (#1318)

* adding --in-suffix option

* print input suffix before generation
---
 examples/common.cpp     | 7 +++++++
 examples/common.h       | 1 +
 examples/main/README.md | 8 ++++++++
 examples/main/main.cpp  | 9 +++++++++
 4 files changed, 25 insertions(+)

diff --git a/examples/common.cpp b/examples/common.cpp
index cd6300041..97eded6ec 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -324,6 +324,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_prefix = argv[i];
+        } else if (arg == "--in-suffix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_suffix = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
@@ -362,6 +368,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
diff --git a/examples/common.h b/examples/common.h
index 138d0ded0..842e1516f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -43,6 +43,7 @@ struct gpt_params {
     std::string prompt = "";
     std::string path_session = ""; // path to file for saving/loading model eval state
     std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_suffix = ""; // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

     std::string lora_adapter = ""; // lora adapter path
diff --git a/examples/main/README.md b/examples/main/README.md
index 6b7facb3b..35f87bcd5 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -112,6 +112,14 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
 ./main -r "User:" --in-prefix " "
 ```

+### In-Suffix
+
+The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input.
+Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
+
+```sh
+./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+```
+
 ### Instruction Mode

 Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 17a5a90d1..43dca8eb5 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -260,6 +260,10 @@ int main(int argc, char ** argv) {
         if (!params.input_prefix.empty()) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
+
+        if (!params.input_suffix.empty()) {
+            fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
+        }
     }
     fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
         params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
@@ -567,6 +571,11 @@ int main(int argc, char ** argv) {
             // Add tokens to embd only if the input buffer is non-empty
             // Entering a empty line lets the user pass control back
             if (buffer.length() > 1) {
+                // append input suffix if any
+                if (!params.input_suffix.empty()) {
+                    buffer += params.input_suffix;
+                    printf("%s", params.input_suffix.c_str());
+                }

                 // instruct mode: insert instruction prefix
                 if (params.instruct && !is_antiprompt) {

From 360cfe5bec852805b84eec799102fc6f45df9fef Mon Sep 17 00:00:00 2001
From: 44670 <44670@users.noreply.github.com>
Date: Fri, 5 May 2023 00:33:31 +0800
Subject: [PATCH 02/10] readme : add OpenBuddy link (#1321)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0002f8cc1..f1fa63542 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
+- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)

 **Bindings:**

From d3e8093e9b5845514b049ede3b12728c8f013eba Mon Sep 17 00:00:00 2001
From: Ivan Stepanov
Date: Thu, 4 May 2023 19:54:37 +0300
Subject: [PATCH 03/10] convert: support DT_BF16 tensors (#1309)

Co-authored-by: Pavol Rusnak
---
 convert.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index 7f7ae05fa..c817a343e 100644
--- a/convert.py
+++ b/convert.py
@@ -67,6 +67,7 @@ FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
     {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}

 DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_BF16: np.dtype(np.uint16),
     DT_F16: np.dtype(np.float16),
     DT_F32: np.dtype(np.float32),
     DT_I32: np.dtype(np.int32),
@@ -276,6 +277,12 @@ class Tensor(metaclass=ABCMeta):
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...


+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
 class UnquantizedTensor(Tensor):
     def __init__(self, ndarray: NDArray) -> None:
         assert isinstance(ndarray, np.ndarray)
@@ -284,6 +291,8 @@ class UnquantizedTensor(Tensor):

     def astype(self, data_type: DataType) -> Tensor:
         dtype = DATA_TYPE_TO_NUMPY[data_type]
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
         return UnquantizedTensor(self.ndarray.astype(dtype))

     def to_ggml(self) -> 'UnquantizedTensor':
@@ -686,6 +695,7 @@ class LazyUnpickler(pickle.Unpickler):
             description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
             return LazyStorage(load=load, kind=pid[1], description=description)

+    @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
@@ -696,12 +706,18 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'pickled storage_offset={storage_offset} in {storage.description}'
         return LazyTensor(load, list(size), storage.kind.data_type, description)

+    @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
     CLASSES: Dict[Any, Any] = {
+        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
         ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
         ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
         ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
         ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
         ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
     }

     def find_class(self, module: str, name: str) -> Any:
@@ -961,7 +977,7 @@ class OutputFile:

 def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     wq_type = model["layers.0.attention.wq.weight"].data_type
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
         return GGMLFileType.AllF32
     if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
         return GGMLFileType.MostlyF16

From 34d9f22f44c42d345cc72c8f3aa4cb71c5df0acb Mon Sep 17 00:00:00 2001
From: Ivan Stepanov
Date: Thu, 4 May 2023 19:56:27 +0300
Subject: [PATCH 04/10] Wrap exceptions in std::exception to verbose output on exception.
 (#1316)

---
 llama-util.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/llama-util.h b/llama-util.h
index d531588d5..88ec28dca 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
     llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -107,10 +108,10 @@ struct llama_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
         }
     }

@@ -133,7 +134,7 @@ struct llama_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
     }

@@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

         if (prefetch) {
@@ -207,7 +208,7 @@ struct llama_mmap {
         DWORD error = GetLastError();

         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
         }

         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
         CloseHandle(hMapping);

         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
         }

 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -245,7 +246,7 @@ struct llama_mmap {

     llama_mmap(struct llama_file *, bool prefetch = true) {
         (void)prefetch;
-        throw std::string("mmap not supported");
+        throw std::runtime_error(std::string("mmap not supported"));
     }
 #endif
 };

From 94c5652fc0f4d04ac54412c4d81e2ebcdafb6ede Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 5 May 2023 00:58:56 +0200
Subject: [PATCH 05/10] quantize: make output filename optional, default to ggml-model-<ftype>.bin (#1301)

---
 examples/quantize/quantize.cpp | 100 ++++++++++++++++++++++++++-------
 1 file changed, 81 insertions(+), 19 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 198bd5fcb..7c77018da 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -6,23 +6,47 @@
 #include <map>
 #include <string>

-static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
+    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };

+bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
+    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
+    if (it != LLAMA_FTYPE_MAP.end()) {
+        ftype = it->second;
+        ftype_str_out = it->first;
+        return true;
+    }
+    // try to parse as an integer
+    try {
+        int ftype_int = std::stoi(ftype_str);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            if (it->second == ftype_int) {
+                ftype = it->second;
+                ftype_str_out = it->first;
+                return true;
+            }
+        }
+    }
+    catch (...) {
+        // stoi failed
+    }
+    return false;
+}
+
 // usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
     ggml_time_init();

-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
+    if (argc < 3) {
+        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
             fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
         }
@@ -36,24 +60,62 @@ int main(int argc, char ** argv) {
         ggml_free(ctx);
     }

+    // parse command line arguments
     const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
+    std::string fname_out;
+    int nthread;
+    llama_ftype ftype;

-    enum llama_ftype ftype;
-    if (argv[3][0] == 'q') {
-        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
-        if (it == LLAMA_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+    int arg_idx = 2;
+    std::string ftype_str;
+    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+        // argv[2] is the ftype
+        std::string fpath;
+        const size_t pos = fname_inp.find_last_of('/');
+        if (pos != std::string::npos) {
+            fpath = fname_inp.substr(0, pos + 1);
+        }
+        // export as [inp path]/ggml-model-[ftype].bin
+        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        arg_idx++;
+    }
+    else {
+        // argv[2] is the output path
+        fname_out = argv[arg_idx];
+        arg_idx++;
+
+        if (argc <= arg_idx) {
+            fprintf(stderr, "%s: missing ftype\n", __func__);
+            return 1;
+        }
+        // argv[3] is the ftype
+        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        arg_idx++;
+    }
+
+    // parse nthreads
+    if (argc > arg_idx) {
+        try {
+            nthread = std::stoi(argv[arg_idx]);
+        }
+        catch (const std::exception & e) {
+            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
             return 1;
         }
-        ftype = it->second;
     } else {
-        ftype = (enum llama_ftype)atoi(argv[3]);
+        nthread = 0;
     }

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    int nthread = argc > 4 ? atoi(argv[4]) : 0;
+    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (nthread > 0) {
+        fprintf(stderr, " using %d threads", nthread);
+    }
+    fprintf(stderr, "\n");

     const int64_t t_main_start_us = ggml_time_us();

From a90e96b266873ebb5e947c9864b12193bdada0fb Mon Sep 17 00:00:00 2001
From: Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
Date: Fri, 5 May 2023 02:17:07 +0200
Subject: [PATCH 06/10] Convert.py @staticmethod (#1327)

* Line 698 has one #staticmethod and should not otherwise throw error at unpickle.load() as not callable

* Update convert.py

---------

Co-authored-by: Ivan Stepanov
---
 convert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert.py b/convert.py
index c817a343e..126beaabc 100644
--- a/convert.py
+++ b/convert.py
@@ -695,7 +695,7 @@ class LazyUnpickler(pickle.Unpickler):
             description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
             return LazyStorage(load=load, kind=pid[1], description=description)

-    @staticmethod
+    # @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
@@ -706,7 +706,7 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'pickled storage_offset={storage_offset} in {storage.description}'
         return LazyTensor(load, list(size), storage.kind.data_type, description)

-    @staticmethod
+    # @staticmethod
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)

From 2d13786e91ec9fd28ddf737053822042a824da78 Mon Sep 17 00:00:00 2001
From: Ionoclast Laboratories
Date: Fri, 5 May 2023 08:18:21 -0400
Subject: [PATCH 07/10] Fix for OpenCL / clbast builds on macOS. (#1329)

---
 Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 94acefdde..260b2487f 100644
--- a/Makefile
+++ b/Makefile
@@ -121,7 +121,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif
 ifdef LLAMA_CLBLAST
     CFLAGS  += -DGGML_USE_CLBLAST
-    LDFLAGS += -lclblast -lOpenCL
+    # Mac provides OpenCL as a framework
+    ifeq ($(UNAME_S),Darwin)
+        LDFLAGS += -lclblast -framework OpenCL
+    else
+        LDFLAGS += -lclblast -lOpenCL
+    endif
     OBJS    += ggml-opencl.o
 ggml-opencl.o: ggml-opencl.c ggml-opencl.h
     $(CC) $(CFLAGS) -c $< -o $@

From 921dcee00a55d9aba3b3026d0509d31ac8386e2a Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Fri, 5 May 2023 16:43:36 +0200
Subject: [PATCH 08/10] readme: add missing info (#1324)

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f1fa63542..233c5c5e1 100644
--- a/README.md
+++ b/README.md
@@ -18,10 +18,12 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
-- AVX2 support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit integer quantization support
+- 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
+- OpenBLAS support
+- cuBLAS and CLBlast support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves

From a3b85b28da84c67c3406807aef5e0457bcc4b00f Mon Sep 17 00:00:00 2001
From: Erik Scholz
Date: Fri, 5 May 2023 22:56:09 +0200
Subject: [PATCH 09/10] ci : add cublas to windows release (#1271)

---
 .github/workflows/build.yml | 77 +++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 179080576..18bb33f94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -210,6 +210,82 @@ jobs:
           path: |
             llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non dev are not enought
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -221,6 +297,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
+      - windows-latest-cmake-cublas

     steps:
       - name: Download artifacts

From 173d0e6419e8f8f3c1f4f13201b777f4c60629f3 Mon Sep 17 00:00:00 2001
From: DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Date: Fri, 5 May 2023 23:57:14 +0200
Subject: [PATCH 10/10] makefile: automatic Arch Linux detection (#1332)

This commit is a port of a detection method used in koboldcpp's Makefile
in order to automatically set the -lcblas option on Arch Linux
---
 Makefile  | 6 +++++-
 README.md | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 260b2487f..0ddff9961 100644
--- a/Makefile
+++ b/Makefile
@@ -107,7 +107,11 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 ifdef LLAMA_OPENBLAS
     CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-    LDFLAGS += -lopenblas
+    ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
+        LDFLAGS += -lopenblas -lcblas
+    else
+        LDFLAGS += -lopenblas
+    endif
 endif
 ifdef LLAMA_CUBLAS
     CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
diff --git a/README.md b/README.md
index 233c5c5e1..19cc94aa2 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,6 @@ Building the program with BLAS support may lead to some performance improvements
   ```bash
   make LLAMA_OPENBLAS=1
   ```
-  Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`

 - On Windows: