Merge branch 'master' into feat-seqrep-sampler-simple

KerfuffleV2 committed 2023-11-18 04:38:30 -07:00
commit f10956876a
23 changed files with 356 additions and 78 deletions


@@ -576,8 +576,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    add_compile_options(-mcpu=native -mtune=native)
-    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        add_compile_options(-mcpu=powerpc64le)
+    else()
+        add_compile_options(-mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
 else()
     message(STATUS "Unknown architecture")
 endif()


@@ -342,6 +342,12 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 	endif
 endif
+ifneq ($(filter ppc64le%,$(UNAME_M)),)
+	MK_CFLAGS += -mcpu=powerpc64le
+	MK_CXXFLAGS += -mcpu=powerpc64le
+	CUDA_POWER_ARCH = 1
+endif
 else
 	MK_CFLAGS += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
@@ -392,6 +398,8 @@ else
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifdef CUDA_POWER_ARCH
+	NVCCFLAGS +=
 else
 	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH


@@ -1220,6 +1220,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
     if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
         data_str = "\"" + data_str + "\"";
         fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
         return;
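Note (not part of the diff): the added pass escapes any backslash that is not already part of a \n or \" sequence produced by the two earlier replacements, so the order of the three passes matters. A minimal standalone sketch of the same three passes, using a hypothetical input value:

    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // hypothetical value containing a raw backslash, a newline and quotes
        std::string data_str = " path=C:\\tmp\n\"x\" ";
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");            // newline -> \n
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");           // "       -> \"
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); // lone \  -> \\ (match kept via $&)
        data_str = "\"" + data_str + "\"";
        printf("%s\n", data_str.c_str());   // prints: " path=C:\\tmp\n\"x\" "
        return 0;
    }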


@@ -1136,6 +1136,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
     fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
     fprintf(stderr, "\n");
 }
@@ -1355,6 +1356,17 @@ bool consume_common_train_arg(
             return true;
         }
         params->adam_gclip = std::stof(argv[i]);
+    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+        params->n_gpu_layers = std::stoi(argv[i]);
+#else
+        fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
     } else if (arg == "-h" || arg == "--help") {
         params->print_usage = true;
         return true;


@@ -6,11 +6,9 @@ from __future__ import annotations
 import argparse
 import json
 import os
-import struct
 import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
-import itertools
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]


@@ -193,7 +193,7 @@ class Model:
             return gguf.MODEL_ARCH.MPT
         if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
             return gguf.MODEL_ARCH.BAICHUAN
-        if arch == "FalconForCausalLM":
+        if arch in ("FalconForCausalLM", "RWForCausalLM"):
             return gguf.MODEL_ARCH.FALCON
         if arch == "GPTBigCodeForCausalLM":
             return gguf.MODEL_ARCH.STARCODER


@@ -2,7 +2,6 @@
 from __future__ import annotations
 import argparse
-import math
 import struct
 import sys
 from enum import IntEnum


@@ -690,6 +690,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
                              data_base_path=pickle_paths[0][:-4],
                              zip_file=zf)
     model = unpickler.load()
+    if 'model' in model: model = model['model']
     as_dict = dict(model.items())
     return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)


@@ -24,6 +24,7 @@ else()
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
     add_subdirectory(main)
+    add_subdirectory(tokenize)
     add_subdirectory(parallel)
     add_subdirectory(perplexity)
     add_subdirectory(quantize)


@@ -3,9 +3,7 @@
 import argparse
 import gguf
-import os
 import struct
-import sys
 import numpy as np
 from pathlib import Path


@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
     struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
     randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+    ggml_set_zero(lora->tok_embeddings_b);
     randomize_tensor_normal(lora->norm_a, rnd);
-    randomize_tensor_normal(lora->norm_b, rnd);
+    ggml_set_zero(lora->norm_b);
     randomize_tensor_normal(lora->output_a, rnd);
-    randomize_tensor_normal(lora->output_b, rnd);
+    ggml_set_zero(lora->output_b);
     for (uint32_t i = 0; i < n_layer; ++i) {
         auto & layer = lora->layers[i];
         randomize_tensor_normal(layer.attention_norm_a, rnd);
-        randomize_tensor_normal(layer.attention_norm_b, rnd);
+        ggml_set_zero(layer.attention_norm_b);
         randomize_tensor_normal(layer.wq_a, rnd);
-        randomize_tensor_normal(layer.wq_b, rnd);
+        ggml_set_zero(layer.wq_b);
         randomize_tensor_normal(layer.wk_a, rnd);
-        randomize_tensor_normal(layer.wk_b, rnd);
+        ggml_set_zero(layer.wk_b);
         randomize_tensor_normal(layer.wv_a, rnd);
-        randomize_tensor_normal(layer.wv_b, rnd);
+        ggml_set_zero(layer.wv_b);
         randomize_tensor_normal(layer.wo_a, rnd);
-        randomize_tensor_normal(layer.wo_b, rnd);
+        ggml_set_zero(layer.wo_b);
         randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        randomize_tensor_normal(layer.ffn_norm_b, rnd);
+        ggml_set_zero(layer.ffn_norm_b);
         randomize_tensor_normal(layer.w1_a, rnd);
-        randomize_tensor_normal(layer.w1_b, rnd);
+        ggml_set_zero(layer.w1_b);
         randomize_tensor_normal(layer.w2_a, rnd);
-        randomize_tensor_normal(layer.w2_b, rnd);
+        ggml_set_zero(layer.w2_b);
         randomize_tensor_normal(layer.w3_a, rnd);
-        randomize_tensor_normal(layer.w3_b, rnd);
+        ggml_set_zero(layer.w3_b);
     }
     free_random_normal_distribution(rnd);
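Context note (not part of the diff): zeroing one of the two LoRA factors while leaving the other randomly initialized is the usual LoRA initialization. The adapter only contributes through the product of the two factors,

    W' = W + B A,    with B = 0  =>  W' = W,

so training starts exactly from the base weights instead of from a randomly perturbed model.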
@@ -1460,17 +1460,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             }
             params->n_rank_w3 = std::stoi(argv[i]);
             params->custom_n_rank_w3 = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params->common.n_gpu_layers = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             train_print_usage(argc, argv, &default_params);


@@ -127,7 +127,14 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
         fclose(file);
         return false;
     }
-    fread(buffer, 1, fileSize, file); // Read the file into the buffer
+    errno = 0;
+    size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
+    if (ferror(file)) {
+        die_fmt("read error: %s", strerror(errno));
+    }
+    if (ret != (size_t) fileSize) {
+        die("unexpectedly reached end of file");
+    }
     fclose(file); // Close the file
     *bytesOut = buffer;
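The same errno / ferror / short-read pattern can be exercised in isolation. A minimal standalone sketch (hypothetical file name, plain return codes instead of the die helpers used above):

    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const char * path = "example.bin";              // hypothetical file
        FILE * file = fopen(path, "rb");
        if (!file) { fprintf(stderr, "open failed: %s\n", strerror(errno)); return 1; }

        fseek(file, 0, SEEK_END);
        long fileSize = ftell(file);
        fseek(file, 0, SEEK_SET);

        std::vector<unsigned char> buffer(fileSize > 0 ? (size_t) fileSize : 0);
        errno = 0;
        size_t ret = fread(buffer.data(), 1, (size_t) fileSize, file);
        if (ferror(file)) {                              // I/O error while reading
            fprintf(stderr, "read error: %s\n", strerror(errno));
        } else if (ret != (size_t) fileSize) {           // EOF hit before the expected size
            fprintf(stderr, "unexpectedly reached end of file\n");
        } else {
            printf("read %zu bytes\n", ret);
        }
        fclose(file);
        return 0;
    }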


@@ -0,0 +1,5 @@
set(TARGET tokenize)
add_executable(${TARGET} tokenize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,44 @@
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
if (argc < 3 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
return 1;
}
const char * model_path = argv[1];
const char * prompt = argv[2];
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
llama_backend_init(false);
llama_model_params model_params = llama_model_default_params();
model_params.vocab_only = true;
llama_model * model = llama_load_model_from_file(model_path, model_params);
llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
const bool add_bos = true;
std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, true);
for (int i = 0; i < (int) tokens.size(); i++) {
if (printing_ids) {
printf("%d\n", tokens[i]);
} else {
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
}
}
return 0;
}
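Note (not part of the diff): since the tool exits immediately, it skips explicit teardown. A longer-lived consumer of the same public API would normally extend the end of main() with the matching release calls from llama.h, along the lines of:

    // hedged sketch: release what llama_new_context_with_model / llama_load_model_from_file / llama_backend_init acquired
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();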


@@ -5840,7 +5840,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5978,7 +5978,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6356,6 +6356,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6378,6 +6379,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:


@@ -345,10 +345,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         }
     }
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
             return false;
         }
-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);
         ++ctx->n_buffers;
     } else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 return false;
             }
-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }


@@ -19,7 +19,7 @@
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
-#ifdef __POWER9_VECTOR__
+#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
 #include <altivec.h>
 #undef bool
 #define bool _Bool

ggml.c (94 changed lines)

@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;
+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne02);
     GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2 == ne12);
     GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(ne03 == ne13);
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
     // GGML_ASSERT(nb1 <= nb2);
     // GGML_ASSERT(nb2 <= nb3);
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif
     if (params->type == GGML_TASK_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (use_blas) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+
+        int transposeA, lda;
+
+        if (!ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+        return;
+    }
+#endif
+
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
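The index mapping described in the comments above can be checked with a tiny standalone sketch (plain loops instead of cblas_sgemm, hypothetical sizes): summing k outer products of src1 rows with src0 rows gives the same dst(m,n) as a single row-major GEMM with a = src1 read as transposed and b = src0, which is what the new BLAS path issues in one call.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical small sizes: src0 is (k,n), src1 is (k,m), dst is (m,n), all row-major
        const int n = 3, k = 2, m = 4;
        std::vector<float> src0(k*n), src1(k*m), dst(m*n, 0.0f), ref(m*n, 0.0f);
        for (int i = 0; i < k*n; i++) src0[i] = 0.1f*(i + 1);
        for (int i = 0; i < k*m; i++) src1[i] = 0.2f*(i + 1);

        // accumulate k outer products (what the scalar out_prod path computes)
        for (int l = 0; l < k; l++)
            for (int i = 0; i < m; i++)
                for (int j = 0; j < n; j++)
                    dst[i*n + j] += src1[l*m + i] * src0[l*n + j];

        // gemm-style loop: c(m,n) = a(m,k) * b(k,n), with a = src1 read as transposed
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                for (int l = 0; l < k; l++)
                    ref[i*n + j] += src1[l*m + i] * src0[l*n + j];

        float max_diff = 0.0f;
        for (int i = 0; i < m*n; i++) max_diff = std::max(max_diff, std::fabs(dst[i] - ref[i]));
        printf("max difference: %g\n", max_diff);   // expected: 0
        return 0;
    }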
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
 }
 const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].key.data;
 }
 enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].type;
 }
 enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.type;
 }
 const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.data;
 }
 const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
 }
 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.n;
 }
 uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
     return ctx->kv[key_id].value.uint8;
 }
 int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
     return ctx->kv[key_id].value.int8;
 }
 uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
     return ctx->kv[key_id].value.uint16;
 }
 int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
     return ctx->kv[key_id].value.int16;
 }
 uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
     return ctx->kv[key_id].value.uint32;
 }
 int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
     return ctx->kv[key_id].value.int32;
 }
 float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
     return ctx->kv[key_id].value.float32;
 }
 uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
     return ctx->kv[key_id].value.uint64;
 }
 int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
     return ctx->kv[key_id].value.int64;
 }
 double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
     return ctx->kv[key_id].value.float64;
 }
 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
     return ctx->kv[key_id].value.bool_;
 }
 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
     return ctx->kv[key_id].value.str.data;
 }
+const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
 int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }

ggml.h (1 changed line)

@@ -2045,6 +2045,7 @@ extern "C" {
     GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
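A hedged usage sketch of the new accessor (fragment assuming an open gguf_context * ctx; the key name is just an example): gguf_get_val_data returns a pointer to the raw scalar value, so the caller checks the KV type first and casts accordingly.

    // look up a scalar key and read it through gguf_get_val_data
    const int key_id = gguf_find_key(ctx, "general.quantization_version");
    if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
        const uint32_t v = *(const uint32_t *) gguf_get_val_data(ctx, key_id);
        printf("quantization version: %u\n", v);
    }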

llama.cpp (165 changed lines)

@@ -91,7 +91,7 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
-#define LLAMA_MAX_NODES 4096
+#define LLAMA_MAX_NODES 8192
 //
 // logging
@@ -604,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }
+
+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
@@ -1087,9 +1141,9 @@ enum e_model {
     MODEL_70B,
 };
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
@@ -1327,6 +1381,9 @@ struct llama_model {
     int n_gpu_layers;
+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;
@@ -1488,7 +1545,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -1785,10 +1842,10 @@ struct llama_model_loader {
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
                     ftype = LLAMA_FTYPE_ALL_F32;
                 } break;
         }
         // this is a way to mark that we have "guessed" the file type
@@ -1802,10 +1859,20 @@ struct llama_model_loader {
         }
         for (int i = 0; i < n_kv; i++) {
             const char * name = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
         // print type counts
@@ -2100,6 +2167,17 @@ static void llm_load_hparams(
     auto & hparams = model.hparams;
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
@@ -2543,8 +2621,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
+    if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2582,7 +2660,7 @@ static void llm_load_tensors(
     ml.calc_sizes(ctx_size, mmapped_size);
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
     // create the ggml context
     {
@@ -3231,7 +3309,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3250,7 +3328,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7962,7 +8040,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8502,7 +8580,7 @@ struct llama_context * llama_new_context_with_model(
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
         // resized during inference
@@ -8547,7 +8625,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8561,7 +8639,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8581,10 +8659,10 @@ struct llama_context * llama_new_context_with_model(
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;
-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                     total_vram_size / 1024.0 / 1024.0,
                     model_vram_size / 1024.0 / 1024.0,
                     ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }
@@ -8605,7 +8683,7 @@ struct llama_context * llama_new_context_with_model(
             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);
 #define LLAMA_METAL_CHECK_BUF(result) \
             if (!(result)) { \
@@ -8671,6 +8749,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
+
+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),

llama.h (17 changed lines)

@@ -301,6 +301,23 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
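A hedged usage sketch of the new metadata API (fragment assuming a model already loaded with llama_load_model_from_file; buffer sizes are arbitrary). Per the comments above, each call returns the string length, so values longer than the buffer are simply truncated:

    // enumerate all GGUF metadata of a loaded llama_model
    char key[128];
    char val[256];
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) >= 0 &&
            llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }

    // or look a single key up by name
    if (llama_model_meta_val_str(model, "general.name", val, sizeof(val)) >= 0) {
        printf("model name: %s\n", val);
    }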


@@ -1,7 +1,5 @@
 # tests with BPE tokenizer
-import os
-import sys
 import argparse
 from transformers import AutoTokenizer


@@ -1,7 +1,5 @@
 # tests with SPM tokenizer
-import os
-import sys
 import argparse
 from sentencepiece import SentencePieceProcessor