Merge branch 'master' into concedo
# Conflicts: # .devops/full.Dockerfile # Makefile # flake.nix
This commit is contained in:
commit
d00b865eb1
6 changed files with 305 additions and 63 deletions
80
Makefile
80
Makefile
|
@ -139,10 +139,10 @@ NOAVX2_BUILD =
|
|||
OPENBLAS_NOAVX2_BUILD =
|
||||
|
||||
ifeq ($(OS),Windows_NT)
|
||||
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
|
||||
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
|
||||
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
|
||||
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp_noavx2.dll $(LDFLAGS)
|
||||
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@ $(LDFLAGS)
|
||||
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@ $(LDFLAGS)
|
||||
OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@ $(LDFLAGS)
|
||||
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@ $(LDFLAGS)
|
||||
else
|
||||
ifndef LLAMA_OPENBLAS
|
||||
ifndef LLAMA_CLBLAST
|
||||
|
@ -166,101 +166,101 @@ $(info I CC: $(CCV))
|
|||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
default: llamalib llamalib_noavx2 llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
|
||||
simple: llamalib llamalib_noavx2
|
||||
dev: llamalib_openblas
|
||||
default: koboldcpp.dll koboldcpp_noavx2.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll
|
||||
simple: koboldcpp.dll koboldcpp_noavx2.dll
|
||||
dev: koboldcpp_openblas.dll
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c ggml.c -o ggml.o
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
|
||||
|
||||
ggml_openblas.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -c $< -o $@
|
||||
|
||||
ggml_noavx2.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml_noavx2.o
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml_openblas_noavx2.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o
|
||||
$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c $< -o $@
|
||||
|
||||
ggml_clblast.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c $< -o $@
|
||||
|
||||
ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c otherarch/ggml_v1.c -o ggml_v1.o
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
|
||||
|
||||
ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) -c otherarch/ggml_v1.c -o ggml_v1_noavx2.o
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
expose.o: expose.cpp expose.h
|
||||
$(CXX) $(CXXFLAGS) -c expose.cpp -o expose.o
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
llama_adapter.o:
|
||||
$(CXX) $(CXXFLAGS) -c llama_adapter.cpp -o llama_adapter.o
|
||||
llama_adapter.o: llama_adapter.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
gpttype_adapter.o:
|
||||
$(CXX) $(CXXFLAGS) -c gpttype_adapter.cpp -o gpttype_adapter.o
|
||||
gpttype_adapter.o: gpttype_adapter.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(CXX) $(CXXFLAGS) ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp.dll $(LDFLAGS)
|
||||
koboldcpp.dll: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(CXX) $(CXXFLAGS) $^ -shared -o $@ $(LDFLAGS)
|
||||
|
||||
llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
koboldcpp_openblas.dll: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(OPENBLAS_BUILD)
|
||||
|
||||
llamalib_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
|
||||
koboldcpp_noavx2.dll: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(NOAVX2_BUILD)
|
||||
|
||||
llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
koboldcpp_openblas_noavx2.dll: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(OPENBLAS_NOAVX2_BUILD)
|
||||
|
||||
llamalib_clblast: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
koboldcpp_clblast.dll: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
|
||||
$(CLBLAST_BUILD)
|
||||
|
||||
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize_llama $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize_gptj: ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) otherarch/gptj_quantize.cpp ggml.o llama.o -o quantize_gptj $(LDFLAGS)
|
||||
quantize_gptj: ggml.o llama.o otherarch/gptj_quantize.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize_gpt2: ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) otherarch/gpt2_quantize.cpp ggml.o llama.o -o quantize_gpt2 $(LDFLAGS)
|
||||
quantize_gpt2: ggml.o llama.o otherarch/gpt2_quantize.cpp
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
benchmark: ggml.o
|
||||
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
./benchmark-q4_0-matmult
|
||||
|
||||
.PHONY: tests
|
||||
|
|
|
@ -16,9 +16,6 @@
|
|||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
bool verbose = false;
|
||||
|
@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
|
|||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
|
||||
// Compare the argument against the name of candidate type j (the loop counter).
// Indexing the name table with the argv index i would never advance the search
// and could read past the end of the table.
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
|
||||
// find match
|
||||
}
|
||||
if (j < GGML_TYPE_COUNT) {
|
||||
|
@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
|
|||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||
is_f16 = true;
|
||||
|
@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// loop through quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const ggml_type type = (ggml_type) i;
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", type_strs[i]);
|
||||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
|
@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
|
|||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[i] };
|
||||
std::string layer_name { ggml_type_name(type) };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
|
@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
|
|||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
||||
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
238
ggml.c
238
ggml.c
|
@ -2671,6 +2671,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
|
|||
};
|
||||
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
|
||||
|
||||
|
||||
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_F32] = "f32",
|
||||
[GGML_TYPE_F16] = "f16",
|
||||
[GGML_TYPE_Q4_0] = "q4_0",
|
||||
[GGML_TYPE_Q4_1] = "q4_1",
|
||||
[GGML_TYPE_I8] = "i8",
|
||||
[GGML_TYPE_I16] = "i16",
|
||||
[GGML_TYPE_I32] = "i32",
|
||||
};
|
||||
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
|
||||
|
||||
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
||||
"NONE",
|
||||
|
||||
|
@ -2712,9 +2724,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
|
|||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_FF",
|
||||
|
||||
"MAP_UNARY",
|
||||
"MAP_BINARY",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
|
||||
static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
@ -2757,9 +2772,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
|
||||
"flash_attn(x)",
|
||||
"flash_ff(x)",
|
||||
|
||||
"f(x)",
|
||||
"f(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
|
||||
static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||
|
@ -2889,6 +2907,11 @@ float ggml_type_sizef(enum ggml_type type) {
|
|||
return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
|
||||
}
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type) {
|
||||
return GGML_TYPE_NAME[type];
|
||||
}
|
||||
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor) {
|
||||
return GGML_TYPE_SIZE[tensor->type];
|
||||
}
|
||||
|
@ -4907,6 +4930,90 @@ struct ggml_tensor * ggml_flash_ff(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml_map_unary
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_impl_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun,
|
||||
bool inplace) {
|
||||
bool is_node = false;
|
||||
|
||||
if (!inplace && a->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
||||
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
||||
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
result->op = GGML_OP_MAP_UNARY;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->opt[0] = addr_tensor;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
return ggml_map_unary_impl_f32(ctx, a, fun, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
return ggml_map_unary_impl_f32(ctx, a, fun, true);
|
||||
}
|
||||
|
||||
// ggml_map_binary
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_impl_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun,
|
||||
bool inplace) {
|
||||
GGML_ASSERT(ggml_are_same_shape(a, b));
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (!inplace && (a->grad || b->grad)) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
|
||||
*((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
|
||||
struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
result->op = GGML_OP_MAP_BINARY;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->src1 = b;
|
||||
result->opt[0] = addr_tensor;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_inplace_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ggml_set_param(
|
||||
|
@ -8875,6 +8982,111 @@ static void ggml_compute_forward_flash_ff(
|
|||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_unary
|
||||
|
||||
static void ggml_compute_forward_map_unary_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
assert( dst->nb[0] == sizeof(float));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fun(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_unary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_map_binary
|
||||
|
||||
static void ggml_compute_forward_map_binary_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
assert(params->ith == 0);
|
||||
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
assert( dst->nb[0] == sizeof(float));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
assert(src1->nb[0] == sizeof(float));
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fun(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])),
|
||||
(float *) ((char *) src1->data + i*(src1->nb[1])));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_binary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
|
@ -9024,6 +9236,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
{
|
||||
ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
|
||||
} break;
|
||||
case GGML_OP_MAP_UNARY:
|
||||
{
|
||||
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
|
||||
ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MAP_BINARY:
|
||||
{
|
||||
const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
|
||||
ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
// nop
|
||||
|
@ -9283,6 +9507,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
} break;
|
||||
case GGML_OP_MAP_UNARY:
|
||||
case GGML_OP_MAP_BINARY:
|
||||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
// nop
|
||||
|
@ -9775,6 +10004,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_MAP_UNARY:
|
||||
case GGML_OP_MAP_BINARY:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
|
|
20
ggml.h
20
ggml.h
|
@ -253,6 +253,9 @@ enum ggml_op {
|
|||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
|
||||
GGML_OP_MAP_UNARY,
|
||||
GGML_OP_MAP_BINARY,
|
||||
|
||||
GGML_OP_COUNT,
|
||||
};
|
||||
|
||||
|
@ -351,6 +354,8 @@ int ggml_blck_size (enum ggml_type type);
|
|||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type);
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
|
@ -652,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
|
|||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1);
|
||||
|
||||
// Mapping operations
//
// User callbacks are invoked once per row as
//   unary : fun(row_length, dst_row, src_row)
//   binary: fun(row_length, dst_row, src0_row, src1_row)
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

// Result is a new tensor created from `a`.
struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t   fun);

// Same as ggml_map_unary_f32, but the result is a view of `a`.
struct ggml_tensor * ggml_map_unary_inplace_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t   fun);

// `a` and `b` must have the same shape; result is a new tensor created from `a`.
struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t   fun);

// Same as ggml_map_binary_f32, but the result is a view of `a`.
struct ggml_tensor * ggml_map_binary_inplace_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t   fun);
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
//
|
||||
|
|
14
llama.cpp
14
llama.cpp
|
@ -269,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static const char * llama_format_type(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32: return "f32";
|
||||
case GGML_TYPE_F16: return "f16";
|
||||
case GGML_TYPE_Q4_0: return "q4_0";
|
||||
case GGML_TYPE_Q4_1: return "q4_1";
|
||||
default: LLAMA_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
||||
size_t size = ggml_type_size(type);
|
||||
for (uint32_t dim : ne) {
|
||||
|
@ -1589,7 +1579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
printf("[%zu/%zu] %36s - %s, type = %6s, ",
|
||||
++idx, model_loader->tensors_map.tensors.size(),
|
||||
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
||||
llama_format_type(tensor.type));
|
||||
ggml_type_name(tensor.type));
|
||||
|
||||
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
||||
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
||||
|
@ -1622,7 +1612,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||
}
|
||||
} else {
|
||||
throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
|
||||
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
||||
}
|
||||
|
||||
printf("quantizing .. ");
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
numpy==1.24
|
||||
sentencepiece==0.1.97
|
||||
sentencepiece==0.1.98
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue