diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7be63925f..c0c9edd56 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -88,6 +88,8 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -7727,7 +7729,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }
 
-void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }
 
@@ -7842,11 +7844,11 @@ static size_t g_temp_tensor_extra_index = 0;
 
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
@@ -8173,11 +8175,11 @@ struct ggml_backend_buffer_context_cuda {
 
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
diff --git a/llama.cpp b/llama.cpp
index 2bd5a4973..c4c18c3b8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6248,7 +6248,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                     // and passing 'add space prefix' as bool argument
                     //
-                    auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                    if (&fragment == &fragment_buffer.front()) {
+                        raw_text = " " + raw_text; // prefix with space if the first token is not special
+                    }
 
 #ifdef PRETOKENIZERDEBUG
                     fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
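For context, the ggml-cuda.cu hunks keep the existing ring-buffer allocation scheme and only enlarge the pool: temp extras are handed out modulo the pool size, so the pool must hold at least as many entries as there are nodes alive in one graph evaluation, hence the bump from `GGML_DEFAULT_GRAPH_SIZE` to 8192. A minimal sketch of that pattern follows; `MAX_NODES`, `tensor_extra`, and `alloc_temp_extra` are simplified stand-ins, not the real ggml types:

```cpp
#include <cstddef>
#include <cstring>

constexpr size_t MAX_NODES = 8192;  // stands in for GGML_CUDA_MAX_NODES

struct tensor_extra {               // simplified stand-in for ggml_tensor_extra_gpu
    void * data_device[16];
};

static tensor_extra * g_extras      = nullptr;
static size_t         g_extra_index = 0;

static tensor_extra * alloc_temp_extra() {
    if (g_extras == nullptr) {
        g_extras = new tensor_extra[MAX_NODES];          // pool allocated once, then reused
    }
    size_t alloc_index = g_extra_index;
    g_extra_index = (g_extra_index + 1) % MAX_NODES;     // wrap around: oldest slot is recycled
    tensor_extra * extra = &g_extras[alloc_index];
    memset(extra, 0, sizeof(*extra));                    // each caller gets a zeroed slot
    return extra;
}

int main() {
    tensor_extra * e = alloc_temp_extra();               // slots are only valid until the
    (void) e;                                            // index wraps back around
    return 0;
}
```

Likewise, the llama.cpp hunk changes which fragments receive the leading space: previously every non-special fragment was prefixed, now only the first fragment in `fragment_buffer` is. A self-contained sketch of the new behavior, where `fragment_t` is a simplified stand-in for llama.cpp's fragment type:

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct fragment_t {                 // simplified stand-in for llama.cpp's fragment type
    std::string raw_text;
};

int main() {
    std::vector<fragment_t> fragment_buffer = { {"Hello"}, {"world"} };
    for (const auto & fragment : fragment_buffer) {
        std::string raw_text = fragment.raw_text;
        // after the patch: only the very first fragment gets the space prefix,
        // instead of every non-special fragment as before
        if (&fragment == &fragment_buffer.front()) {
            raw_text = " " + raw_text;
        }
        printf("'%s'\n", raw_text.c_str());  // prints ' Hello' then 'world'
    }
    return 0;
}
```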