Merge branch 'master' into gg/flash-attn
ggml-ci
commit cb3547ac46

3 changed files with 20 additions and 11 deletions
README.md

@@ -93,6 +93,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
@@ -119,8 +120,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
-- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
@@ -135,6 +137,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)

 **HTTP server**
ci/run.sh

@@ -160,8 +160,9 @@ function gg_run_test_scripts_debug {
     set -e

+    # TODO: too slow, run on dedicated node
     (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

     set +e
 }
llama.cpp

@@ -2999,9 +2999,13 @@ struct llama_model_loader {
         ggml_tensor * tensor;

-        llama_tensor_weight(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+        llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
             const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
             offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
+
+            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
+                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
+            }
         }
     };
     std::vector<llama_tensor_weight> weights;
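The new `file` parameter lets each `llama_tensor_weight` validate its data range against the size of the file it was read from, so truncated or corrupted models are rejected at load time. Below is a minimal, self-contained sketch of the same overflow-safe bounds check; `file_view`, `check_tensor_bounds`, and the tensor names are illustrative stand-ins, not the loader's real API.

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the opened model file; only its size matters here.
struct file_view {
    size_t size;
};

// Throws if [offs, offs + nbytes) does not fit inside the file.
// The first comparison catches wrap-around when offs + nbytes overflows size_t.
void check_tensor_bounds(size_t offs, size_t nbytes, const file_view & file, const std::string & name) {
    if (offs + nbytes < offs || offs + nbytes > file.size) {
        throw std::runtime_error("tensor '" + name + "' data is not within the file bounds, model is corrupted or incomplete");
    }
}

int main() {
    file_view file{1024};
    check_tensor_bounds(0, 512, file, "tok_embd.weight");      // fits, passes
    try {
        check_tensor_bounds(900, 512, file, "output.weight");  // extends past end of file
    } catch (const std::exception & e) {
        std::cout << e.what() << "\n";
    }
}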
@@ -3040,15 +3044,15 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));

+        files.emplace_back(new llama_file(fname.c_str(), "rb"));
+        contexts.emplace_back(ctx);
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-            weights.emplace_back(0, cur->name, meta, cur);
+            weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
         }
-        files.emplace_back(new llama_file(fname.c_str(), "rb"));
-        contexts.emplace_back(ctx);
-
         uint16_t n_split = 0;
         get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
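The reordering matters: the file handle is pushed into `files` before the tensor index is built, so the `files.back().get()` pointer handed to each `llama_tensor_weight` refers to the file whose size the new bounds check consults. A rough sketch of that ordering constraint, with simplified stand-in types (`file_record` and `weight_entry` are hypothetical, not the loader's real structures):

#include <cstddef>
#include <memory>
#include <vector>

// Simplified stand-ins: a file record and a weight entry that keeps a
// non-owning pointer back to the file its tensor data lives in.
struct file_record  { size_t size; };
struct weight_entry { const file_record * file; /* offset, tensor, ... */ };

int main() {
    std::vector<std::unique_ptr<file_record>> files;
    std::vector<weight_entry> weights;

    // 1) register the file first, so files.back() is the file being indexed ...
    files.emplace_back(new file_record{4096});

    // 2) ... then add its tensors; each entry can check its data range
    //    against files.back()->size at construction time.
    weights.push_back({files.back().get()});
}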
@@ -3082,13 +3086,14 @@ struct llama_model_loader {
                 throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
             }

-            // Save tensors data offset info of the shard.
-            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
-                weights.emplace_back(idx, cur->name, ctx_gguf, cur);
-            }
             files.emplace_back(new llama_file(split_path, "rb"));
             contexts.emplace_back(ctx);

+            // Save tensors data offset info of the shard.
+            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+                weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
+            }
+
             gguf_free(ctx_gguf);
         }