Merge branch 'master' into concedo

# Conflicts:
#   Makefile

Commit 0b904e12db: 10 changed files with 430 additions and 33 deletions
.gitignore (vendored, 1 changed line)

@@ -19,6 +19,7 @@ models/*
 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
Makefile (10 changed lines)

@@ -170,7 +170,7 @@ gpttype_adapter.o:
 	$(CXX) $(CXXFLAGS) -c gpttype_adapter.cpp -o gpttype_adapter.o
 
 clean:
-	rm -vf *.o main quantize perplexity embedding main.exe quantize.exe koboldcpp.dll koboldcpp_blas.dll gptj.exe
+	rm -vf *.o main quantize quantize-stats perplexity embedding main.exe quantize.exe koboldcpp.dll koboldcpp_blas.dll gptj.exe gpt2.exe
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)

@@ -187,6 +187,10 @@ llamalib_blas: ggml_blas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
 
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 quantize_gptj: ggml.o llama.o
 	$(CXX) $(CXXFLAGS) otherarch/gptj_quantize.cpp ggml.o llama.o -o quantize_gptj $(LDFLAGS)
 

@@ -198,6 +202,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 
+gpt2: ggml.o
+	$(CXX) $(CXXFLAGS) otherarch/gpt2_v2.cpp otherarch/utils.cpp ggml.o -o gpt2 $(LDFLAGS)
+
 libllama.so: llama.o ggml.o
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
 
 #
 # Tests
 #
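With these Makefile targets in place, the new binaries build like the existing ones; assuming a standard GNU make setup, something like the following should work, and "make clean" now removes both new binaries as well:

    make quantize-stats gpt2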
build.zig

@@ -3,12 +3,14 @@ const std = @import("std");
 pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
+    const want_lto = b.option(bool, "lto", "Want -fLTO");
 
     const lib = b.addStaticLibrary(.{
         .name = "llama",
         .target = target,
         .optimize = optimize,
     });
+    lib.want_lto = want_lto;
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("examples");

@@ -17,11 +19,11 @@ pub fn build(b: *std.Build) void {
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
         "examples/common.cpp",
     }, &.{"-std=c++11"});
     lib.install();
 
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
 
     const exe = build_example("main", build_args);
     _ = build_example("quantize", build_args);
     _ = build_example("perplexity", build_args);

@@ -44,16 +46,19 @@ fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
     const lib = args.lib;
     const target = args.target;
     const optimize = args.optimize;
+    const want_lto = args.want_lto;
 
     const exe = b.addExecutable(.{
         .name = name,
         .target = target,
         .optimize = optimize,
     });
+    exe.want_lto = want_lto;
     exe.addIncludePath(".");
     exe.addIncludePath("examples");
     exe.addCSourceFiles(&.{
         std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
         "examples/common.cpp",
     }, &.{"-std=c++11"});
     exe.linkLibrary(lib);
     exe.install();
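The "lto" option is defined once in build(), carried through build_args, and re-read inside build_example(), so the static library and every example binary get the same setting. Assuming the Zig build conventions of this revision, it would be enabled from the command line with:

    zig build -Dlto=true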
examples/CMakeLists.txt

@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
     add_subdirectory(main)
     add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
     add_subdirectory(embedding)
 endif()
examples/quantize-stats/CMakeLists.txt (new file, 4 lines)

@@ -0,0 +1,4 @@
set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/quantize-stats/quantize-stats.cpp (new file, 355 lines)

@@ -0,0 +1,355 @@
#include "ggml.h"
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <numeric>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
    bool reference = false;
    std::vector<std::string> include_layers;
    std::vector<std::string> exclude_layers;
    std::vector<enum ggml_type> include_types;
};

const int64_t SCRATCH_ELEMENTS = 32*32;
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;

struct error_stats {
    size_t num_samples;
    double total_error;
    double max_error;
    uint64_t error_histogram[HISTOGRAM_BUCKETS];
};


void quantize_stats_print_usage(int /*argc*/, char ** argv) {
    quantize_stats_params params;
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "  -r, --reference\n");
    fprintf(stderr, "                        use reference implementation (default: false)\n");
    fprintf(stderr, "  -v, --verbose\n");
    fprintf(stderr, "                        verbose output (default: false)\n");
    fprintf(stderr, "  -p, --per-layer-stats\n");
    fprintf(stderr, "                        print stats per layer (default: false)\n");
    fprintf(stderr, "  --histogram\n");
    fprintf(stderr, "                        print error histogram (default: false)\n");
    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
    fprintf(stderr, "                        only test layers matching pattern\n");
    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
    fprintf(stderr, "                        exclude layers matching pattern\n");
    fprintf(stderr, "  -t TYPE, --type TYPE\n");
    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
    fprintf(stderr, "\n");
}

// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params params, const std::string & layer) {
    for (const auto& excluded : params.exclude_layers) {
        if (std::regex_search(layer, std::regex(excluded))) {
            return false;
        }
    }
    for (const auto& included : params.include_layers) {
        if (std::regex_search(layer, std::regex(included))) {
            return true;
        }
    }
    return params.include_layers.empty();
}

// Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
    for (int64_t i = 0; i < nelements; i++) {
        double diff = input[i] - output[i];
        stats.total_error += diff * diff;
        stats.max_error = fmax(fabs(diff), stats.max_error);
        stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
    }
    stats.num_samples += nelements;
}

double find_quantile(const error_stats & stats, double quantile) {
    double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

    double accum = 0;
    for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
        accum += stats.error_histogram[i];
        if (accum >= sum*quantile) {
            return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
        }
    }
    return INFINITY;
}

void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
    double rmse = sqrt(stats.total_error / (double) stats.num_samples);
    double median = find_quantile(stats, .5);
    double pct95 = find_quantile(stats, .95);
    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
    if (print_histogram) {
        printf("Error distribution:\n");
        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
        }
    }
}

// copied from ggml.h - verify that we can access this as a flat array
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
        tensor->nb[0] == ggml_type_size(tensor->type) &&
        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
        std::string & name,
        bool print_layer_stats,
        const quantize_fns_t & qfns,
        bool use_reference,
        const ggml_tensor * layer,
        float * input_scratch,
        char * quantized_scratch,
        float * output_scratch,
        error_stats & total_error) {

    assert(tensor_is_contiguous(layer));
    error_stats layer_error {};
    int64_t nelements = ggml_nelements(layer);

    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);

        if (layer->type == GGML_TYPE_F16) {
            for (int i = 0; i < chunk_size; i++) {
                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
            }
        } else {
            input_scratch = ggml_get_data_f32(layer) + offset;
        }

        if (use_reference) {
            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
        } else {
            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
        }
        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
        if (print_layer_stats) {
            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
        }
    }
    if (print_layer_stats) {
        print_error_stats(name, layer_error, false);
    }
}

int main(int argc, char ** argv) {
    ggml_time_init();

    quantize_stats_params params;

    // read command line

    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            quantize_stats_print_usage(argc, argv);
            exit(0);
        } else if (arg == "-r" || arg == "--reference") {
            params.reference = true;
        } else if (arg == "-v") {
            params.verbose = true;
        } else if (arg == "-p" || arg == "--per-layer-stats") {
            params.per_layer_stats = true;
        } else if (arg == "--histogram") {
            params.print_histogram = true;
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model = argv[i];
        } else if (arg == "-l" || arg == "--include-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.include_layers.push_back(argv[i]);
        } else if (arg == "-L" || arg == "--exclude-layer") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.exclude_layers.push_back(argv[i]);
        } else if (arg == "-t" || arg == "--type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int j;
            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {
                params.include_types.push_back((ggml_type) j);
            } else {
                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                invalid_param = true;
            }
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            quantize_stats_print_usage(argc, argv);
            return 1;
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        quantize_stats_print_usage(argc, argv);
        return 1;
    }

    // load the model
    fprintf(stderr, "Loading model\n");

    const int64_t t_main_start_us = ggml_time_us();
    llama_context * ctx;

    {
        auto lparams = llama_context_default_params();

        lparams.n_ctx     = 256;
        lparams.n_parts   = 1;
        lparams.seed      = 1;
        lparams.f16_kv    = false;
        lparams.use_mlock = false;

        ctx = llama_init_from_file(params.model.c_str(), lparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
            return 1;
        }
    }

    // Sort tensors for consistent output
    const auto tensors = llama_internal_get_tensor_map(ctx);
    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };

    // check layer tensors
    int included_layers = 0;
    int64_t max_nelements = 0;
    bool is_f16 = false;
    for (const auto& kv_tensor : tensors_sorted) {
        if (!layer_included(params, kv_tensor.first)) {
            continue;
        }
        if (params.verbose) {
            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
        }
        if (kv_tensor.second->type == GGML_TYPE_F16) {
            is_f16 = true;
        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
            llama_free(ctx);
            return 1;
        }
        included_layers++;
        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
    }

    if (is_f16) {
        printf("note: source model is f16\n");
    }
    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
    // allocate scratch space
    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
    std::vector<float> output_scratch(SCRATCH_ELEMENTS);

    // loop through quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
            if (params.verbose) {
                printf("testing %s ...\n", type_strs[i]);
            }

            error_stats global_stats {};

            for (const auto& kv_tensor : tensors_sorted) {
                if (!layer_included(params, kv_tensor.first)) {
                    continue;
                }
                if (params.verbose) {
                    printf("  %s ...\n", kv_tensor.first.c_str());
                }
                std::string layer_name { type_strs[i] };
                layer_name += "::" + kv_tensor.first;
                test_roundtrip_on_layer(
                    layer_name,
                    params.per_layer_stats,
                    qfns,
                    params.reference,
                    kv_tensor.second,
                    input_scratch.data(),
                    quantized_scratch.data(),
                    output_scratch.data(),
                    global_stats
                );
            }

            print_error_stats(type_strs[i], global_stats, params.print_histogram);
        }
    }


    llama_free(ctx);
    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
}
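For reference, a plausible invocation of the new tool using only flags defined in the source above (the model path shown is just the built-in default):

    ./quantize-stats -m models/7B/ggml-model-f16.bin -t q4_0 -p --histogram

This restricts testing to q4_0, prints rmse/maxerr/quantile stats per layer, and ends with an error histogram over the whole model.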
ggml.c (30 changed lines)

@@ -6564,29 +6564,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     //}
 }
 
-typedef void (*dequantize_row_q_t)(const void * restrict x, float * restrict y, int k);
-typedef void (*quantize_row_q_t)(const float * restrict x, void * restrict y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * restrict s, const void * restrict x, const void * restrict y);
-
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t quantize_row_q;
-    vec_dot_q_t vec_dot_q;
-} quantize_fns_t;
-
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = dequantize_row_q4_0,
-        .quantize_row_q = quantize_row_q4_0,
-        .vec_dot_q = ggml_vec_dot_q4_0,
+        .dequantize_row_q         = dequantize_row_q4_0,
+        .quantize_row_q           = quantize_row_q4_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = dequantize_row_q4_1,
-        .quantize_row_q = quantize_row_q4_1,
-        .vec_dot_q = ggml_vec_dot_q4_1,
+        .dequantize_row_q         = dequantize_row_q4_1,
+        .quantize_row_q           = quantize_row_q4_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
+        .vec_dot_q                = ggml_vec_dot_q4_1,
     },
 };
 
+// For internal test use
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
+    GGML_ASSERT(i < GGML_TYPE_COUNT);
+    return quantize_fns[i];
+}
+
 static void ggml_compute_forward_mul_mat_q_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
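The new ggml_internal_get_quantize_fn export is what lets quantize-stats drive the quantization routines directly. A minimal round-trip sketch of the same idea on a synthetic buffer (not part of the commit; assumes it is compiled and linked against this revision of ggml, and that n is a multiple of the q4_0 block size of 32):

    #include "ggml.h"

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n = 64;
        std::vector<float> input(n), output(n);
        std::vector<char> quantized(n * 4);  // generous scratch, as in quantize-stats

        for (int i = 0; i < n; i++) {
            input[i] = std::sin(0.1f * i);
        }

        // Fetch the function table for q4_0 and quantize/dequantize one chunk.
        quantize_fns_t qfns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
        qfns.quantize_row_q(input.data(), quantized.data(), n);
        qfns.dequantize_row_q(quantized.data(), output.data(), n);

        double total = 0.0;
        for (int i = 0; i < n; i++) {
            const double diff = input[i] - output[i];
            total += diff * diff;
        }
        printf("q4_0 round-trip rmse: %.8f\n", std::sqrt(total / n));
        return 0;
    }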
ggml.h (24 changed lines)

@@ -783,6 +783,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
llama.cpp (22 changed lines)

@@ -1238,19 +1238,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
         }
     }
 
-    if (top_k > 0 && top_k < n_logits) {
-        sample_top_k(logits_id, top_k);
-    }
-
-    float maxl = -std::numeric_limits<float>::infinity();
-    for (const auto & kv : logits_id) {
-        maxl = Max(maxl, kv.first);
-    }
+    sample_top_k(logits_id, top_k > 0 ? Min(top_k, n_logits) : n_logits);
 
     // compute probs for the top k tokens
     std::vector<float> probs;
     probs.reserve(logits_id.size());
 
+    float maxl = logits_id[0].first;
     double sum = 0.0;
     for (const auto & kv : logits_id) {
         const float p = expf(kv.first - maxl);

@@ -1273,16 +1267,11 @@ static llama_vocab::id llama_sample_top_p_top_k(
                 break;
             }
         }
-
-        cumsum = 1.0/cumsum;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            probs[i] *= cumsum;
-        }
     }
 
     //printf("\n");
     //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
     //}
     //printf("\n\n");
     //exit(0);

@@ -1865,3 +1854,8 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
+// For internal test use
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+    return ctx->model.tensors;
+}
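The first hunk works because sample_top_k leaves logits_id sorted in descending order: logits_id[0].first is already the maximum logit, so the separate max scan is redundant, and clamping top_k with Min folds the top_k >= n_logits case into one call. A standalone sketch of the simplified flow (toy values, not the fork's actual helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<std::pair<float, int>> logits_id = {
            {1.0f, 0}, {3.0f, 1}, {2.0f, 2}, {0.5f, 3},
        };
        const size_t top_k = 2;

        // Equivalent of sample_top_k: keep the top_k largest, sorted descending.
        std::partial_sort(logits_id.begin(), logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<float, int> & a, const std::pair<float, int> & b) {
                return a.first > b.first;
            });
        logits_id.resize(top_k);

        const float maxl = logits_id[0].first;  // max logit, no extra scan needed
        std::vector<float> probs;
        double sum = 0.0;
        for (const auto & kv : logits_id) {
            const float p = std::exp(kv.first - maxl);
            probs.push_back(p);
            sum += p;
        }
        for (auto & p : probs) {
            p = (float)(p / sum);
        }
        printf("top token: id %d, prob %.4f\n", logits_id[0].second, probs[0]);
        return 0;
    }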
llama.h (7 changed lines)

@@ -164,6 +164,13 @@ extern "C" {
 
 #ifdef __cplusplus
 }
+
+#include <string>
+#include <unordered_map>
+//
+// Internal function exposed for tests and benchmarks
+//
+std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
 #endif
 
 #endif
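Because the declaration sits after the extern "C" block closes, it is visible to C++ consumers only. A sketch of how a test program could enumerate the model's tensors through it (assumes a model at the default path used elsewhere in this commit):

    #include "llama.h"

    #include <cstdio>

    int main() {
        llama_context_params lparams = llama_context_default_params();
        llama_context * ctx = llama_init_from_file("models/7B/ggml-model-f16.bin", lparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }
        // Iterate the internal name -> tensor map exposed for tests.
        for (const auto & kv : llama_internal_get_tensor_map(ctx)) {
            printf("%s\n", kv.first.c_str());
        }
        llama_free(ctx);
        return 0;
    }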