Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	SHA256SUMS
Concedo 2023-06-11 23:27:28 +08:00
commit b9a4da3c6f
5 changed files with 26 additions and 5 deletions

View file

@@ -632,6 +632,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
         case CONSOLE_COLOR_USER_INPUT:
             fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
             break;
+        case CONSOLE_COLOR_ERROR:
+            fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+            break;
     }
     con_st.color = color;
     fflush(con_st.out);

View file

@@ -112,7 +112,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
 enum console_color_t {
     CONSOLE_COLOR_DEFAULT=0,
     CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
+    CONSOLE_COLOR_USER_INPUT,
+    CONSOLE_COLOR_ERROR
 };

 struct console_state {
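For reference, a minimal standalone sketch of what the new CONSOLE_COLOR_ERROR case emits: bold red via standard ANSI escapes, followed by a reset. The helper below is hypothetical (the real code routes this through console_set_color and console_state); the escape values are the usual ANSI codes, not copied from the repo.

#include <cstdio>

// Hypothetical standalone helper, not the repo's API.
#define ANSI_BOLD        "\x1b[1m"   // bold / bright attribute
#define ANSI_COLOR_RED   "\x1b[31m"  // red foreground
#define ANSI_COLOR_RESET "\x1b[0m"   // reset all attributes

static void print_error_colored(const char * msg) {
    fprintf(stderr, ANSI_BOLD ANSI_COLOR_RED "%s" ANSI_COLOR_RESET "\n", msg);
    fflush(stderr);
}

int main() {
    print_error_colored("<<input too long: skipped 3 tokens>>");
    return 0;
}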

View file

@@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
     }

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -331,6 +334,19 @@ int main(int argc, char ** argv) {
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+
             // infinite text generation via context swapping
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
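The same clamp in isolation: a minimal sketch using a plain std::vector<int> in place of the embd token buffer (hypothetical helper, not the example's code). Because the earlier hunk floors params.n_ctx at 8, max_embd_size = n_ctx - 4 is always positive.

#include <cstdio>
#include <vector>

// Hypothetical helper mirroring the truncation above; n_ctx - 4 keeps the same
// headroom used for command-line prompt handling.
static void truncate_to_context(std::vector<int> & embd, int n_ctx) {
    const int max_embd_size = n_ctx - 4;
    if ((int) embd.size() > max_embd_size) {
        const size_t skipped_tokens = embd.size() - max_embd_size;
        fprintf(stderr, "<<input too long: skipped %zu token%s>>\n",
                skipped_tokens, skipped_tokens != 1 ? "s" : "");
        embd.resize(max_embd_size);  // drop the tail so decoding stays within the context window
    }
}

int main() {
    std::vector<int> embd(600, 1);   // pretend 600 tokens are pending
    truncate_to_context(embd, 512);  // n_ctx = 512 -> keep at most 508
    fprintf(stderr, "kept %zu tokens\n", embd.size());
    return 0;
}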

View file

@@ -1105,6 +1105,9 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
+        // The allocation error can be bypassed: a null ptr will be returned from this function,
+        // which fixes the OOM error in WSL.
+        cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
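Why the extra cudaGetLastError() call matters: the CUDA runtime keeps the last error in per-thread state, so a failed cudaMallocHost would otherwise be re-reported by a later, unrelated call (the symptom seen as an OOM under WSL, where pinned allocations often fail). A minimal sketch of the pattern, as a hypothetical wrapper rather than the repo's ggml_cuda_host_malloc:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical wrapper: on a failed pinned allocation, read (and thereby clear)
// the runtime's last-error state, warn, and fall back to pageable memory.
static void * host_malloc_with_fallback(size_t size) {
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        cudaGetLastError();  // reset the error so later CUDA calls don't re-report it
        fprintf(stderr, "WARNING: pinned alloc of %zu bytes failed: %s\n",
                size, cudaGetErrorString(err));
        return malloc(size); // pageable fallback: transfers are slower but still correct
    }
    return ptr;
}

In the actual patch the function still returns nullptr on failure and the caller chooses the fallback; the wrapper above just folds the two steps together for illustration.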

View file

@@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     const uint8x16_t m4b = vdupq_n_u8(0xf);
 #ifdef __ARM_FEATURE_DOTPROD
-    const uint32x4_t mzero = vdupq_n_s32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
     int8x16x2_t q4bytes;

@@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint32x4_t mzero = vdupq_n_u32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);

@@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     *s = sumf;
 #endif
 }
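The hunks in the last file are purely a type correction: vdupq_n_s32 yields an int32x4_t, which is also the accumulator type the NEON dot-product intrinsic expects, so declaring mzero as uint32x4_t (or seeding it with vdupq_n_u32) mixes signed and unsigned vector types. A minimal sketch, assuming an AArch64 compiler with the dot-product extension enabled (e.g. -march=armv8.2-a+dotprod); the function is hypothetical, not taken from the repo:

#include <arm_neon.h>
#include <cstdint>

// Hypothetical 16-element int8 dot product; the zero accumulator must be
// int32x4_t to match vdotq_s32(int32x4_t, int8x16_t, int8x16_t).
static int32_t dot16(const int8_t * a, const int8_t * b) {
    const int32x4_t mzero = vdupq_n_s32(0);          // int32x4_t, not uint32x4_t
    const int8x16_t va = vld1q_s8(a);
    const int8x16_t vb = vld1q_s8(b);
    const int32x4_t acc = vdotq_s32(mzero, va, vb);  // four partial sums of four products each
    return vaddvq_s32(acc);                          // horizontal add to a single int32
}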