Merge remote-tracking branch 'origin/master' into tool-call

2025-01-21 13:44:58 +00:00 · 2025-01-21 13:44:58 +00:00 · fec0260366
commit fec0260366
parent c606255948 6171c9d258
6 changed files with 190 additions and 165 deletions
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -345,8 +345,18 @@ struct lora_merge_ctx {
            gf = ggml_new_graph(ctx0);
            struct ggml_tensor * cur = inp_base;
            for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                struct ggml_tensor * delta;
+                bool is_tok_embd = string_starts_with(name_base, "token_embd");
+                if (is_tok_embd) {
+                    printf("%s :     detected token embeddings tensor\n", __func__);
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
+                        ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
+                } else {
+                    delta = ggml_mul_mat(ctx0,
+                        ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
+                        ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                }
                // scale
                const float alpha = adapters[i]->alpha;
                const float rank  = (float) inp_b[i]->ne[0];
--- a/examples/run/linenoise.cpp/linenoise.cpp
+++ b/examples/run/linenoise.cpp/linenoise.cpp
@@ -103,24 +103,26 @@
 *
 */

-#include <termios.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <stdlib.h>
+#    include "linenoise.h"
+
 #    include <ctype.h>
+#    include <errno.h>
+#    include <stdio.h>
+#    include <string.h>
+#    include <sys/file.h>
+#    include <sys/ioctl.h>
 #    include <sys/stat.h>
 #    include <sys/types.h>
-#include <sys/ioctl.h>
+#    include <termios.h>
 #    include <unistd.h>
+
+#    include <memory>
+#    include <string>
 #    include <vector>
-#include "linenoise.h"

 #    define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100
 #    define LINENOISE_MAX_LINE                4096
-static std::vector<const char*> unsupported_term = {"dumb","cons25","emacs",nullptr};
+static std::vector<const char *>    unsupported_term   = { "dumb", "cons25", "emacs" };
 static linenoiseCompletionCallback *completionCallback = NULL;
 static linenoiseHintsCallback *hintsCallback = NULL;
 static linenoiseFreeHintsCallback *freeHintsCallback = NULL;
@@ -166,21 +168,58 @@ int linenoiseHistoryAdd(const char *line);
 #define REFRESH_ALL (REFRESH_CLEAN|REFRESH_WRITE) // Do both.
 static void refreshLine(struct linenoiseState *l);

+class File {
+  public:
+    FILE * file = nullptr;
+
+    FILE * open(const std::string & filename, const char * mode) {
+        file = fopen(filename.c_str(), mode);
+
+        return file;
+    }
+
+    int lock() {
+        if (file) {
+            fd = fileno(file);
+            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                fd = -1;
+
+                return 1;
+            }
+        }
+
+        return 0;
+    }
+
+    ~File() {
+        if (fd >= 0) {
+            flock(fd, LOCK_UN);
+        }
+
+        if (file) {
+            fclose(file);
+        }
+    }
+
+  private:
+    int fd = -1;
+};
+
 __attribute__((format(printf, 1, 2)))
 /* Debugging function. */
 #if 0
 static void lndebug(const char *fmt, ...) {
-    static FILE *lndebug_fp = NULL;
-    if (lndebug_fp == NULL) {
-        lndebug_fp = fopen("/tmp/lndebug.txt", "a");
+    static File file;
+    if (file.file == nullptr) {
+        file.open("/tmp/lndebug.txt", "a");
    }

-    if (lndebug_fp != NULL) {
+    if (file.file != nullptr) {
        va_list args;
        va_start(args, fmt);
-        vfprintf(lndebug_fp, fmt, args);
+        vfprintf(file.file, fmt, args);
        va_end(args);
-        fflush(lndebug_fp);
+        fflush(file.file);
    }
 }
 #else
@@ -213,8 +252,11 @@ void linenoiseSetMultiLine(int ml) {
 static int isUnsupportedTerm(void) {
    char *term = getenv("TERM");
    if (term == NULL) return 0;
-    for (int j = 0; unsupported_term[j]; ++j)
-        if (!strcasecmp(term, unsupported_term[j])) return 1;
+    for (size_t j = 0; j < unsupported_term.size(); ++j) {
+        if (!strcasecmp(term, unsupported_term[j])) {
+            return 1;
+        }
+    }
    return 0;
 }

@@ -334,17 +376,6 @@ static void linenoiseBeep(void) {
    fflush(stderr);
 }

-/* ============================== Completion ================================ */
-
-/* Free a list of completion option populated by linenoiseAddCompletion(). */
-static void freeCompletions(linenoiseCompletions *lc) {
-    size_t i;
-    for (i = 0; i < lc->len; i++)
-        free(lc->cvec[i]);
-    if (lc->cvec != NULL)
-        free(lc->cvec);
-}
-
 /* Called by completeLine() and linenoiseShow() to render the current
 * edited line with the proposed completion. If the current completion table
 * is already available, it is passed as second argument, otherwise the
@@ -353,7 +384,7 @@ static void freeCompletions(linenoiseCompletions *lc) {
 * Flags are the same as refreshLine*(), that is REFRESH_* macros. */
 static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseCompletions *lc, int flags) {
    /* Obtain the table of completions if the caller didn't provide one. */
-    linenoiseCompletions ctable = { 0, NULL };
+    linenoiseCompletions ctable;
    if (lc == NULL) {
        completionCallback(ls->buf, &ctable);
        lc = &ctable;
@@ -372,8 +403,9 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple
        refreshLineWithFlags(ls, flags);
    }

-    /* Free the completions table if needed. */
-    if (lc != &ctable) freeCompletions(&ctable);
+    if (lc == &ctable) {
+        ctable.to_free = false;
+    }
 }

 /* This is an helper function for linenoiseEdit*() and is called when the
@@ -391,7 +423,7 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple
 * possible completions, and the caller should read for the next characters
 * from stdin. */
 static int completeLine(struct linenoiseState *ls, int keypressed) {
-    linenoiseCompletions lc = { 0, NULL };
+    linenoiseCompletions lc;
    int nwritten;
    char c = keypressed;

@@ -420,8 +452,7 @@ static int completeLine(struct linenoiseState *ls, int keypressed) {
            default:
                /* Update buffer and return */
                if (ls->completion_idx < lc.len) {
-                    nwritten = snprintf(ls->buf,ls->buflen,"%s",
-                        lc.cvec[ls->completion_idx]);
+                    nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]);
                    ls->len = ls->pos = nwritten;
                }
                ls->in_completion = 0;
@@ -436,7 +467,6 @@ static int completeLine(struct linenoiseState *ls, int keypressed) {
        }
    }

-    freeCompletions(&lc);
    return c; /* Return last read character */
 }

@@ -462,53 +492,25 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *fn) {
 * user typed <tab>. See the example.c source code for a very easy to
 * understand example. */
 void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) {
-    size_t len = strlen(str);
-    char *copy, **cvec;
-
-    copy = (char*) malloc(len + 1);
-    if (copy == NULL) return;
-    memcpy(copy,str,len+1);
-    cvec = (char**) realloc(lc->cvec,sizeof(char*)*(lc->len+1));
-    if (cvec == NULL) {
-        free(copy);
+    const size_t len  = strlen(str);
+    auto         copy = std::make_unique<char[]>(len + 1);
+    if (!copy) {
        return;
    }
+
+    memcpy(copy.get(), str, len + 1);
+    char ** cvec = static_cast<char **>(std::realloc(lc->cvec, sizeof(char *) * (lc->len + 1)));
+    if (cvec == nullptr) {
+        return;
+    }
+
    lc->cvec = cvec;
-    lc->cvec[lc->len++] = copy;
-}
-
-/* =========================== Line editing ================================= */
-
-/* We define a very simple "append buffer" structure, that is an heap
- * allocated string where we can append to. This is useful in order to
- * write all the escape sequences in a buffer and flush them to the standard
- * output in a single call, to avoid flickering effects. */
-struct abuf {
-    char *b;
-    int len;
-};
-
-static void abInit(struct abuf *ab) {
-    ab->b = NULL;
-    ab->len = 0;
-}
-
-static void abAppend(struct abuf *ab, const char *s, int len) {
-    char *new_ptr = (char*) realloc(ab->b,ab->len+len);
-
-    if (new_ptr == NULL) return;
-    memcpy(new_ptr+ab->len,s,len);
-    ab->b = new_ptr;
-    ab->len += len;
-}
-
-static void abFree(struct abuf *ab) {
-    free(ab->b);
+    lc->cvec[lc->len++] = copy.release();
 }

 /* Helper of refreshSingleLine() and refreshMultiLine() to show hints
 * to the right of the prompt. */
-static void refreshShowHints(struct abuf * ab, struct linenoiseState * l, int plen) {
+static void refreshShowHints(std::string & ab, struct linenoiseState * l, int plen) {
    char seq[64];
    if (hintsCallback && plen+l->len < l->cols) {
        int color = -1, bold = 0;
@@ -522,10 +524,11 @@ static void refreshShowHints(struct abuf * ab, struct linenoiseState * l, int pl
                snprintf(seq,64,"\033[%d;%d;49m",bold,color);
            else
                seq[0] = '\0';
-            abAppend(ab,seq,strlen(seq));
-            abAppend(ab,hint,hintlen);
+            ab.append(seq);
+            ab.append(hint, hintlen);
            if (color != -1 || bold != 0)
-                abAppend(ab,"\033[0m",4);
+                ab.append("\033[0m");
+
            /* Call the function to free the hint returned. */
            if (freeHintsCallback) freeHintsCallback(hint);
        }
@@ -546,8 +549,7 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
    char *buf = l->buf;
    size_t len = l->len;
    size_t pos = l->pos;
-    struct abuf ab;
-
+    std::string ab;
    while((plen+pos) >= l->cols) {
        buf++;
        len--;
@@ -557,35 +559,34 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
        len--;
    }

-    abInit(&ab);
    /* Cursor to left edge */
    snprintf(seq,sizeof(seq),"\r");
-    abAppend(&ab,seq,strlen(seq));
+    ab.append(seq);

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
-        abAppend(&ab,l->prompt,strlen(l->prompt));
+        ab.append(l->prompt);
        if (maskmode == 1) {
-            while (len--) abAppend(&ab,"*",1);
+            while (len--) {
+                ab.append("*");
+            }
        } else {
-            abAppend(&ab,buf,len);
+            ab.append(buf, len);
        }
        /* Show hits if any. */
-        refreshShowHints(&ab,l,plen);
+        refreshShowHints(ab, l, plen);
    }

    /* Erase to right */
    snprintf(seq,sizeof(seq),"\x1b[0K");
-    abAppend(&ab,seq,strlen(seq));
-
+    ab.append(seq);
    if (flags & REFRESH_WRITE) {
        /* Move cursor to original position. */
        snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen));
-        abAppend(&ab,seq,strlen(seq));
+        ab.append(seq);
    }

-    if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */
-    abFree(&ab);
+    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
 }

 /* Multi line low level line refresh.
@@ -604,26 +605,23 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
    int col; /* colum position, zero-based. */
    int old_rows = l->oldrows;
    int fd = l->ofd, j;
-    struct abuf ab;
-
+    std::string ab;
    l->oldrows = rows;

    /* First step: clear all the lines used before. To do so start by
     * going to the last row. */
-    abInit(&ab);
-
    if (flags & REFRESH_CLEAN) {
        if (old_rows-rpos > 0) {
            lndebug("go down %d", old_rows-rpos);
            snprintf(seq,64,"\x1b[%dB", old_rows-rpos);
-            abAppend(&ab,seq,strlen(seq));
+            ab.append(seq);
        }

        /* Now for every row clear it, go up. */
        for (j = 0; j < old_rows-1; j++) {
            lndebug("clear+up");
            snprintf(seq,64,"\r\x1b[0K\x1b[1A");
-            abAppend(&ab,seq,strlen(seq));
+            ab.append(seq);
        }
    }

@@ -631,21 +629,22 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        /* Clean the top line. */
        lndebug("clear");
        snprintf(seq,64,"\r\x1b[0K");
-        abAppend(&ab,seq,strlen(seq));
+        ab.append(seq);
    }

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
-        abAppend(&ab,l->prompt,strlen(l->prompt));
+        ab.append(l->prompt);
        if (maskmode == 1) {
-            unsigned int i;
-            for (i = 0; i < l->len; i++) abAppend(&ab,"*",1);
+            for (unsigned int i = 0; i < l->len; ++i) {
+                ab.append("*");
+            }
        } else {
-            abAppend(&ab,l->buf,l->len);
+            ab.append(l->buf, l->len);
        }

        /* Show hits if any. */
-        refreshShowHints(&ab,l,plen);
+        refreshShowHints(ab, l, plen);

        /* If we are at the very end of the screen with our prompt, we need to
         * emit a newline and move the prompt to the first column. */
@@ -654,9 +653,9 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
            (l->pos+plen) % l->cols == 0)
        {
            lndebug("<newline>");
-            abAppend(&ab,"\n",1);
+            ab.append("\n");
            snprintf(seq,64,"\r");
-            abAppend(&ab,seq,strlen(seq));
+            ab.append(seq);
            rows++;
            if (rows > (int)l->oldrows) l->oldrows = rows;
        }
@@ -669,7 +668,7 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        if (rows-rpos2 > 0) {
            lndebug("go-up %d", rows-rpos2);
            snprintf(seq,64,"\x1b[%dA", rows-rpos2);
-            abAppend(&ab,seq,strlen(seq));
+            ab.append(seq);
        }

        /* Set column. */
@@ -679,14 +678,12 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
            snprintf(seq,64,"\r\x1b[%dC", col);
        else
            snprintf(seq,64,"\r");
-        abAppend(&ab,seq,strlen(seq));
+        ab.append(seq);
    }

    lndebug("\n");
    l->oldpos = l->pos;
-
-    if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */
-    abFree(&ab);
+    (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */
 }

 /* Calls the two low level functions refreshSingleLine() or
@@ -1313,16 +1310,17 @@ int linenoiseHistorySetMaxLen(int len) {
 * otherwise -1 is returned. */
 int linenoiseHistorySave(const char *filename) {
    mode_t old_umask = umask(S_IXUSR|S_IRWXG|S_IRWXO);
-    FILE *fp;
-    int j;
-
-    fp = fopen(filename,"w");
+    File   file;
+    file.open(filename, "w");
    umask(old_umask);
-    if (fp == NULL) return -1;
+    if (file.file == NULL) {
+        return -1;
+    }
    chmod(filename,S_IRUSR|S_IWUSR);
-    for (j = 0; j < history_len; j++)
-        fprintf(fp,"%s\n",history[j]);
-    fclose(fp);
+    for (int j = 0; j < history_len; ++j) {
+        fprintf(file.file, "%s\n", history[j]);
+    }
+
    return 0;
 }

@@ -1332,12 +1330,14 @@ int linenoiseHistorySave(const char *filename) {
 * If the file exists and the operation succeeded 0 is returned, otherwise
 * on error -1 is returned. */
 int linenoiseHistoryLoad(const char *filename) {
-    FILE *fp = fopen(filename,"r");
+    File file;
+    file.open(filename, "r");
    char buf[LINENOISE_MAX_LINE];
+    if (file.file == NULL) {
+        return -1;
+    }

-    if (fp == NULL) return -1;
-
-    while (fgets(buf,LINENOISE_MAX_LINE,fp) != NULL) {
+    while (fgets(buf, LINENOISE_MAX_LINE, file.file) != NULL) {
        char *p;

        p = strchr(buf,'\r');
@@ -1345,7 +1345,6 @@ int linenoiseHistoryLoad(const char *filename) {
        if (p) *p = '\0';
        linenoiseHistoryAdd(buf);
    }
-    fclose(fp);
    return 0;
 }
 #endif
--- a/examples/run/linenoise.cpp/linenoise.h
+++ b/examples/run/linenoise.cpp/linenoise.h
@@ -45,6 +45,7 @@ extern "C" {
 #endif

 #include <stddef.h> /* For size_t. */
+#include <stdlib.h>

 extern const char *linenoiseEditMore;

@@ -69,10 +70,23 @@ struct linenoiseState {
    int history_index;  /* The history index we are currently editing. */
 };

-typedef struct linenoiseCompletions {
-  size_t len;
-  char **cvec;
-} linenoiseCompletions;
+struct linenoiseCompletions {
+    size_t  len     = 0;
+    char ** cvec    = nullptr;
+    bool    to_free = true;
+
+    ~linenoiseCompletions() {
+        if (!to_free) {
+            return;
+        }
+
+        for (size_t i = 0; i < len; ++i) {
+            free(cvec[i]);
+        }
+
+        free(cvec);
+    }
+};

 /* Non blocking API. */
 int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3779,9 +3779,9 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
            { "bos_token",                   ctx_server.chat_templates.template_default->bos_token() },
            { "eos_token",                   ctx_server.chat_templates.template_default->eos_token() },
-            { "chat_template",               ctx_server.chat_templates.template_default->source() },
            { "build_info",                  build_info },
        };
        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4416,7 +4416,6 @@ void kernel_mul_mv_q2_K_f32_impl(
        device const half     * dh = &x[ib].d;

        for (int row = 0; row < N_DST; row++) {
-
            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
            for (int i = 0; i < 8; i += 2) {
@@ -4447,7 +4446,7 @@ void kernel_mul_mv_q2_K_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -4613,7 +4612,7 @@ void kernel_mul_mv_q3_K_f32_impl(
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

    if (tiisg == 0) {
-        for (int row = 0; row < 2; ++row) {
+        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
            dst_f32[first_row + row] = sumf1[row];
        }
    }
@@ -4729,7 +4728,7 @@ void kernel_mul_mv_q4_K_f32_impl(

    device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -4861,7 +4860,7 @@ void kernel_mul_mv_q5_K_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
        const float tot = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = tot;
@@ -4906,6 +4905,10 @@ void kernel_mul_mv_q6_K_f32_impl(

    const int row = 2*r0 + sgitg;

+    if (row >= args.ne0) {
+        return;
+    }
+
    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;

@@ -5061,7 +5064,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5179,7 +5182,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5289,7 +5292,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum * 0.5f;
@@ -5401,7 +5404,7 @@ void kernel_mul_mv_iq3_s_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -5514,7 +5517,7 @@ void kernel_mul_mv_iq2_s_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5614,7 +5617,7 @@ void kernel_mul_mv_iq1_s_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -5709,7 +5712,7 @@ void kernel_mul_mv_iq1_m_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < N_DST; ++row) {
+    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -5799,7 +5802,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
@@ -5888,7 +5891,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = all_sum;
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -181,7 +181,7 @@ struct ggml_backend_rpc_context {

 struct ggml_backend_rpc_buffer_context {
    std::shared_ptr<socket_t> sock;
-    std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
+    void * base_ptr;
    uint64_t remote_ptr;
 };

@@ -423,16 +423,15 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {

 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
-    if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
-        return ctx->base_cache[buffer];
+    if (ctx->base_ptr != nullptr) {
+        return ctx->base_ptr;
    }
    rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
    rpc_msg_buffer_get_base_rsp response;
    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
    GGML_ASSERT(status);
-    void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
-    ctx->base_cache[buffer] = base_ptr;
-    return base_ptr;
+    ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
+    return ctx->base_ptr;
 }

 static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -557,7 +556,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
    if (response.remote_ptr != 0) {
        ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
            ggml_backend_rpc_buffer_interface,
-            new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
+            new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
            response.remote_size);
        return buffer;
    } else {