From e18f7345a300920e234f732077bda660cc6cda9c Mon Sep 17 00:00:00 2001 From: "Xiang (Kevin) Li" Date: Sat, 9 Dec 2023 16:29:27 -0500 Subject: [PATCH 1/8] grammar : revert the replacement of llama_token_to_piece with id_to_token (#4396) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index b12bbd1b0..93d8f3e16 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7503,7 +7503,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; - const std::string & piece = ctx->model.vocab.id_to_token[id].text; + const std::string piece = llama_token_to_piece(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; @@ -7715,7 +7715,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string & piece = ctx->model.vocab.id_to_token[token].text; + const std::string piece = llama_token_to_piece(ctx, token); // Note terminating 0 in decoded string const auto decoded = decode_utf8(piece, grammar->partial_utf8); From 8a7b2fa528f130631a5f43648481596ab320ed5a Mon Sep 17 00:00:00 2001 From: Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Date: Mon, 11 Dec 2023 06:27:38 +0800 Subject: [PATCH 2/8] Update README.md (#4388) Fix small typo. --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index cfc220f58..0751b9612 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -222,7 +222,7 @@ node index.js `content`: Set the text to process. - **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. +- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* From 41a11aaf99feff4901e4c8dc48ad00766c5da4e9 Mon Sep 17 00:00:00 2001 From: Taikono-Himazin Date: Tue, 12 Dec 2023 18:24:32 +0900 Subject: [PATCH 3/8] ggml : increased GGML_MAX_PARAMS to allow finetuning of 70b models (#4424) --- ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.h b/ggml.h index a8f10cbd5..41a075e92 100644 --- a/ggml.h +++ b/ggml.h @@ -215,7 +215,7 @@ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_MAX_DIMS 4 -#define GGML_MAX_PARAMS 1024 +#define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 From d9d4cfef64ea416dd66632173787d03ffb180cc7 Mon Sep 17 00:00:00 2001 From: Vladimir Zorin Date: Tue, 12 Dec 2023 11:25:29 +0200 Subject: [PATCH 4/8] server : fix local model name in server (#4420) --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 895f751c9..d0cd8e1cd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2382,6 +2382,7 @@ json oaicompat_completion_params_parse( llama_params["__oaicompat"] = true; // Map OpenAI parameters to llama.cpp parameters + llama_params["model"] = json_value(body, "model", std::string("uknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.8); From 6391817cd19a4507c6c941a1fd08756268662b2d Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 12 Dec 2023 04:25:57 -0500 Subject: [PATCH 5/8] llama : document logits_all deprecation (#4418) llama_context_params.logits_all is a parameter for controlling llama_eval. This documents that logits_all should not be used with llama_decode and llama_batch. --- llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.h b/llama.h index b1f5fca62..45a65cacb 100644 --- a/llama.h +++ b/llama.h @@ -216,7 +216,7 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) - bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU }; From 6138963fb232cbae70c9d181db0ba125708f473d Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 12 Dec 2023 04:27:26 -0500 Subject: [PATCH 6/8] build : target Windows 8 for standard mingw-w64 (#4405) * build : target Windows 8 for standard mingw-w64 * make : fix missing console.o deps This was causing a link error with `make all` on Windows. --- CMakeLists.txt | 5 +++++ Makefile | 17 ++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 78de2dd1a..eea4673d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -593,6 +593,11 @@ else() message(STATUS "Unknown architecture") endif() +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=0x602) +endif() + # # POSIX conformance # diff --git a/Makefile b/Makefile index a1a6cae54..e77595952 100644 --- a/Makefile +++ b/Makefile @@ -306,12 +306,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #MK_CXXFLAGS += -mssse3 endif -# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 -# https://github.com/ggerganov/llama.cpp/issues/2922 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' + # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 + # https://github.com/ggerganov/llama.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move + + # Target Windows 8 for PrefetchVirtualMemory + MK_CPPFLAGS += -D_WIN32_WINNT=0x602 endif ifneq ($(filter aarch64%,$(UNAME_M)),) @@ -730,16 +733,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) From 9494d7c4774ab745490b5a19570ff7747a194143 Mon Sep 17 00:00:00 2001 From: Richard Kiss Date: Tue, 12 Dec 2023 01:53:36 -0800 Subject: [PATCH 7/8] english : use `typos` to fix comments and logs (#4354) --- common/log.h | 8 ++++---- convert.py | 4 ++-- examples/llava/clip.cpp | 2 +- examples/llava/convert-image-encoder-to-gguf.py | 2 +- examples/lookahead/README.md | 2 +- examples/server/json.hpp | 2 +- examples/server/public/completion.js | 2 +- examples/server/public/index.html | 6 +++--- examples/speculative/README.md | 2 +- examples/speculative/speculative.cpp | 2 +- ggml-alloc.h | 2 +- ggml-quants.c | 4 ++-- ggml.c | 12 ++++++------ gguf-py/README.md | 2 +- llama.cpp | 10 +++++----- tests/test-grad0.cpp | 2 +- tests/test-quantize-perf.cpp | 4 ++-- 17 files changed, 34 insertions(+), 34 deletions(-) diff --git a/common/log.h b/common/log.h index c0e814861..e4e1b9f4f 100644 --- a/common/log.h +++ b/common/log.h @@ -61,13 +61,13 @@ // #define LOG_TARGET stderr // #include "log.h" // -// The log target can also be redirected to a diffrent function +// The log target can also be redirected to a different function // like so: // -// #define LOG_TARGET log_handler_diffrent() +// #define LOG_TARGET log_handler_different() // #include "log.h" // -// FILE* log_handler_diffrent() +// FILE* log_handler_different() // { // return stderr; // } @@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS // Disables logs entirely at runtime. // Makes LOG() and LOG_TEE() produce no output, -// untill enabled back. +// until enabled back. #define log_disable() log_disable_impl() // INTERNAL, DO NOT USE diff --git a/convert.py b/convert.py index 6e95d6cb3..a6fc6b8ea 100755 --- a/convert.py +++ b/convert.py @@ -585,7 +585,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: if any("model.embed_tokens.weight" in mp.model for mp in models_plus): # Transformers models put different tensors in different files, but - # don't split indivdual tensors between files. + # don't split individual tensors between files. model: LazyModel = {} for mp in models_plus: model.update(mp.model) @@ -678,7 +678,7 @@ class LazyUnpickler(pickle.Unpickler): return func(*args) CLASSES: dict[tuple[str, str], Any] = { - # getattr used here as a workaround for mypy not being smart enough to detrmine + # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index fc0656c23..4bb7b93b6 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip temp->ny = longer_side; temp->size = 3 * longer_side * longer_side; temp->data = new uint8_t[temp->size](); - uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA + uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA // fill with background color for (size_t i = 0; i < temp->size; i++) { diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index 729aaef8f..03688e0ea 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -51,7 +51,7 @@ def bytes_to_unicode(): The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. + This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md index 252a6689e..a69a471b4 100644 --- a/examples/lookahead/README.md +++ b/examples/lookahead/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/lookahead -Demonstartion of lookahead decoding technique: +Demonstration of lookahead decoding technique: https://lmsys.org/blog/2023-11-21-lookahead-decoding/ diff --git a/examples/server/json.hpp b/examples/server/json.hpp index 4d1a37ad7..ea945f346 100644 --- a/examples/server/json.hpp +++ b/examples/server/json.hpp @@ -11227,7 +11227,7 @@ class binary_reader } if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array { - return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr)); + return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr)); } std::vector dim; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim))) diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index b9c442509..c281f0fbd 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -114,7 +114,7 @@ export async function* llama(prompt, params = {}, config = {}) { return content; } -// Call llama, return an event target that you can subcribe to +// Call llama, return an event target that you can subscribe to // // Example: // diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 175c52478..18a6ccf0f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -238,7 +238,7 @@ cache_prompt: true }) - /* START: Support for storing prompt templates and parameters in borwser LocalStorage */ + /* START: Support for storing prompt templates and parameters in browsers LocalStorage */ const local_storage_storageKey = "llamacpp_server_local_storage"; @@ -282,7 +282,7 @@ let importedTemplates = local_storage_getDataAsObject('user_templates') if (importedTemplates) { - // saved templates were successfuly imported. + // saved templates were successfully imported. console.log('Processing saved templates and updating default template') params.value = { ...params.value, image_data: [] }; @@ -303,7 +303,7 @@ } function userTemplateResetToDefault() { - console.log('Reseting themplate to default') + console.log('Resetting template to default') selectedUserTemplate.value.name = 'default'; selectedUserTemplate.value.data = savedUserTemplates.value['default']; } diff --git a/examples/speculative/README.md b/examples/speculative/README.md index d88fd3790..814efa592 100644 --- a/examples/speculative/README.md +++ b/examples/speculative/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/speculative -Demonstartion of speculative decoding and tree-based speculative decoding techniques +Demonstration of speculative decoding and tree-based speculative decoding techniques More info: diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index dca3f84a5..20f1fb5bf 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -428,7 +428,7 @@ int main(int argc, char ** argv) { ++n_past_tgt; } - // the first token is always proposed by the traget model before the speculation loop so we erase it here + // the first token is always proposed by the target model before the speculation loop so we erase it here for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { continue; diff --git a/ggml-alloc.h b/ggml-alloc.h index ad87cebc8..64a412468 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph // ggml-backend v2 API // -// Seperate tensor and graph allocator objects +// Separate tensor and graph allocator objects // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators // The original API is kept as a wrapper around the new API diff --git a/ggml-quants.c b/ggml-quants.c index 7285d5f7f..0e8163a16 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri size_t vl = __riscv_vsetvl_e8m1(qk/2); - // These tempory registers are for masking and shift operations + // These temporary registers are for masking and shift operations vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl); vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl); @@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri vl = 16; - // retreive lane to multiply with scale + // retrieve lane to multiply with scale vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); diff --git a/ggml.c b/ggml.c index ca56f063c..eb7989dc4 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,4 @@ -#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" @@ -33,7 +33,7 @@ // we should just be careful :) #pragma warning(disable: 4244 4267) -// disable POSIX deprecation warnigns +// disable POSIX deprecation warnings // these functions are never going away, anyway #pragma warning(disable: 4996) #endif @@ -1760,7 +1760,7 @@ static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); // WARN: -// Mis-confguration can lead to problem that's hard to reason about: +// Mis-configuration can lead to problem that's hard to reason about: // * At best it crash or talks nosense. // * At worst it talks slightly difference but hard to perceive. // @@ -7520,7 +7520,7 @@ static void ggml_compute_forward_acc_f32( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); // view src0 and dst with these strides and data offset inbytes during acc - // nb0 is implicitely element_size because src0 and dst are contiguous + // nb0 is implicitly element_size because src0 and dst are contiguous size_t nb1 = ((int32_t *) dst->op_params)[0]; size_t nb2 = ((int32_t *) dst->op_params)[1]; size_t nb3 = ((int32_t *) dst->op_params)[2]; @@ -10161,7 +10161,7 @@ static void ggml_compute_forward_set_f32( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); // view src0 and dst with these strides and data offset inbytes during set - // nb0 is implicitely element_size because src0 and dst are contiguous + // nb0 is implicitly element_size because src0 and dst are contiguous size_t nb1 = ((int32_t *) dst->op_params)[0]; size_t nb2 = ((int32_t *) dst->op_params)[1]; size_t nb3 = ((int32_t *) dst->op_params)[2]; @@ -14475,7 +14475,7 @@ void ggml_build_backward_gradient_checkpointing( // insert new tensors recomputing src, reusing already made replacements, // remember replacements: remember new tensors with mapping from corresponding gf nodes // recurse for input tensors, - // unless (i.e. terminating when) input tensors are replacments (like checkpoints) + // unless (i.e. terminating when) input tensors are replacements (like checkpoints) node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); } // insert rewritten backward node with replacements made into resulting backward graph gb diff --git a/gguf-py/README.md b/gguf-py/README.md index 502b6a510..a27d2fc0e 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw pip install build twine ``` -Then, folow these steps to release a new version: +Then, follow these steps to release a new version: 1. Bump the version in `pyproject.toml`. 2. Build the package: diff --git a/llama.cpp b/llama.cpp index 93d8f3e16..54fa9e43e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2758,7 +2758,7 @@ static void llm_load_vocab( // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed // to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer // are special tokens. - // From testing, this appears to corelate 1:1 with special tokens. + // From testing, this appears to correlate 1:1 with special tokens. // // Counting special tokens and verifying in only one direction @@ -5846,7 +5846,7 @@ static int llama_decode_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; - // helpers for smoother batch API transistion + // helpers for smoother batch API transition // after deprecating the llama_eval calls, these will be removed std::vector pos; @@ -6625,12 +6625,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // loop over the text while (true) { - // find the first occurence of a given special token in this fragment + // find the first occurrence of a given special token in this fragment // passing offset argument only limit the "search area" but match coordinates // are still relative to the source full raw_text auto match = raw_text->find(special_token, raw_text_base_offset); - // no occurences found, stop processing this fragment for a given special token + // no occurrences found, stop processing this fragment for a given special token if (match == std::string::npos) break; // check if match is within bounds of offset <-> length @@ -7829,7 +7829,7 @@ struct llama_beam_search_data { } // Min-heaps are used to efficiently collect the top-k elements (k=n_beams). - // The repetative patterns below reflect the 2 stages of heaps: + // The repetitive patterns below reflect the 2 stages of heaps: // * Gather elements until the vector is full, then call std::make_heap() on it. // * If the heap is full and a new element is found that should be included, pop the // least element to the back(), replace it with the new, then push it into the heap. diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 7fe9154dd..81c20a89c 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -1,4 +1,4 @@ -#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #include "ggml.h" #include diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 88fac0e23..62d0190f9 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -117,7 +117,7 @@ static void usage(char * argv[]) { printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); - printf(" --op OP set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); + printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); printf(" quantize_row_q_dot, vec_dot_q (all)\n"); printf(" --type TYPE set test type as"); for (int i = 0; i < GGML_TYPE_COUNT; i++) { @@ -202,7 +202,7 @@ int main(int argc, char * argv[]) { } int alignment = std::stoi(argv[i]); if (alignment < 0 || alignment > MAX_ALIGNMENT) { - fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT); + fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT); invalid_param = true; break; } From fecac45658a99eddc4d6e36ba0310ca8f87a77f0 Mon Sep 17 00:00:00 2001 From: kalomaze <66376113+kalomaze@users.noreply.github.com> Date: Tue, 12 Dec 2023 04:12:35 -0600 Subject: [PATCH 8/8] server : tweak default sampling parameters (#4367) * Set a more typical Top P setting as the default * Update temp max --- examples/server/public/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 18a6ccf0f..451fd4a3b 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -223,7 +223,7 @@ repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled top_k: 40, // <= 0 to use vocab size - top_p: 0.5, // 1.0 = disabled + top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled @@ -762,7 +762,7 @@
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })} - ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} + ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}