Merge branch 'ggerganov:master' into fix

This commit is contained in:
Xinpeng Dou 2024-09-12 12:18:39 +08:00 committed by GitHub
commit d9132f7846
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 84 additions and 57 deletions

View file

@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

View file

@ -941,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#ifdef LLAMA_USE_CURL #ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool starts_with(const std::string & str, const std::string & prefix) { static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with... // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0; return str.rfind(prefix, 0) == 0;
} }
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl // Initialize libcurl
@ -1049,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback)); curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
CURLcode res = curl_easy_perform(curl.get()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (res != CURLE_OK) { if (!was_perform_successful) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return false; return false;
} }
@ -1126,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}; };
// start the download // start the download
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
auto res = curl_easy_perform(curl.get()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (res != CURLE_OK) { if (!was_perform_successful) {
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return false; return false;
} }

View file

@ -31,6 +31,7 @@ import re
import requests import requests
import sys import sys
import json import json
import shutil
from hashlib import sha256 from hashlib import sha256
from enum import IntEnum, auto from enum import IntEnum, auto
@ -125,12 +126,27 @@ def download_model(model):
if tokt == TOKENIZER_TYPE.UGM: if tokt == TOKENIZER_TYPE.UGM:
files.append("spiece.model") files.append("spiece.model")
for file in files: if os.path.isdir(repo):
save_path = f"models/tokenizers/{name}/{file}" # If repo is a path on the file system, copy the directory
if os.path.isfile(save_path): for file in files:
logger.info(f"{name}: File {save_path} already exists - skipping") src_path = os.path.join(repo, file)
continue dst_path = f"models/tokenizers/{name}/{file}"
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) if os.path.isfile(dst_path):
logger.info(f"{name}: File {dst_path} already exists - skipping")
continue
if os.path.isfile(src_path):
shutil.copy2(src_path, dst_path)
logger.info(f"{name}: Copied {src_path} to {dst_path}")
else:
logger.warning(f"{name}: Source file {src_path} does not exist")
else:
# If repo is a URL, download the files
for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models: for model in models:

View file

@ -3,32 +3,10 @@
#include "llama.h" #include "llama.h"
#include <algorithm> #include <algorithm>
#include <cmath>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
static void print_usage(int, char ** argv) { static void print_usage(int, char ** argv) {
LOG_TEE("\nexample usage:\n"); LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);

View file

@ -18,8 +18,8 @@ struct llava_context {
}; };
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
} }
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
return 1; return 1;
} }

View file

@ -2009,7 +2009,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr; return nullptr;
} }
ggml_cann_set_device(ctx->device);
ggml_backend_t cann_backend = ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface, /* .interface = */ ggml_backend_cann_interface,

View file

@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
// nrows_dst == nrows of the matrix that the kernel writes into // nrows_dst == nrows of the matrix that the kernel writes into
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; // The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:

View file

@ -2742,6 +2742,7 @@ struct mmq_args {
int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne00; int64_t ne01; int64_t stride01;
int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne10; int64_t ne11; int64_t stride11;
int64_t ne0; int64_t ne0;
bool use_stream_k;
}; };
template<ggml_type type> template<ggml_type type>
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
const dim3 block_nums_xy_tiling(nty, ntx, 1); const dim3 block_nums_xy_tiling(nty, ntx, 1);
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; if (!args.use_stream_k) {
if (!use_stream_k) {
if (args.ne01 % mmq_y == 0) { if (args.ne01 % mmq_y == 0) {
constexpr bool need_check = false; constexpr bool need_check = false;
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>> mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>

View file

@ -16080,19 +16080,21 @@ static int llama_decode_internal(
return -1; return -1;
} }
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
const auto & model = lctx.model; const auto & model = lctx.model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams; const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
if (batch_all.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
}
GGML_ASSERT(n_tokens_all <= cparams.n_batch); GGML_ASSERT(n_tokens_all <= cparams.n_batch);
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@ -16379,19 +16381,21 @@ static int llama_encode_internal(
return -1; return -1;
} }
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
const auto & model = lctx.model; const auto & model = lctx.model;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams; const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
}
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");