diff --git a/convert.py b/convert.py index e3f1096e1..8d7af06d1 100755 --- a/convert.py +++ b/convert.py @@ -142,9 +142,9 @@ def find_n_mult(n_ff: int, n_embd: int) -> int: @dataclass class Params: n_vocab: int - n_embd: int - n_mult: int - n_head: int + n_embd: int + n_mult: int + n_head: int n_layer: int @staticmethod @@ -167,11 +167,11 @@ class Params: n_head=n_embd // 128 # guessed return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_mult=256, - n_head=n_head, - n_layer=n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = 256, + n_head = n_head, + n_layer = n_layer, ) @staticmethod @@ -179,28 +179,53 @@ class Params: config = json.load(open(config_path)) n_vocab = config["vocab_size"]; - n_embd = config["hidden_size"]; - n_head = config["num_attention_heads"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; n_layer = config["num_hidden_layers"]; - n_ff = config["intermediate_size"]; + n_ff = config["intermediate_size"]; n_mult = find_n_mult(n_ff, n_embd); return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_mult=n_mult, - n_head=n_head, - n_layer=n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + ) + + # LLaMA v2 70B params.json + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 + @staticmethod + def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["dim"]; + n_head = config["n_heads"]; + n_layer = config["n_layers"]; + n_mult = config["multiple_of"]; + + if n_vocab == -1: + n_vocab = model["tok_embeddings.weight"].shape[0] + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, ) @staticmethod def load(model_plus: 'ModelPlus') -> 'Params': + hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" - hf_transformer_config_path = model_plus.paths[0].parent / "config.json" - if hf_transformer_config_path.exists(): - params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + if hf_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) + elif orig_config_path.exists(): + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) else: params = Params.guessed(model_plus.model) @@ -1036,8 +1061,7 @@ class OutputFile: @staticmethod def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of = OutputFile(fname_out) - params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, - n_head=1, n_layer=0) + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) of = OutputFile(fname_out) of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_vocab(vocab) diff --git a/examples/common.cpp b/examples/common.cpp index 2dc6654da..7a1928f2b 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -117,6 +117,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -168,6 +171,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "-gqa" || arg == "--gqa") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_gqa = std::stoi(argv[i]); } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; @@ -458,91 +467,92 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -i, --interactive run in interactive mode\n"); - fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); - fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); - fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n"); - fprintf(stderr, " (can be specified more than once for multiple prompts).\n"); - fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: empty)\n"); - fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - fprintf(stderr, " not supported with --interactive or other interactive options\n"); - fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); - fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " prompt file to start generation.\n"); - fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); - fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); - fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); - fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); - fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); - fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); - fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); - fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); - fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); - fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); - fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); - fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); - fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); - fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n"); - fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); - fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " --perplexity compute perplexity over each ctx window of the prompt\n"); - fprintf(stderr, " --perplexity-lines compute perplexity over each line of the prompt\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + fprintf(stdout, "usage: %s [options]\n", argv[0]); + fprintf(stdout, "\n"); + fprintf(stdout, "options:\n"); + fprintf(stdout, " -h, --help show this help message and exit\n"); + fprintf(stdout, " -i, --interactive run in interactive mode\n"); + fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n"); + fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); + fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n"); + fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n"); + fprintf(stdout, " (can be specified more than once for multiple prompts).\n"); + fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n"); + fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stdout, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stdout, " prompt to start generation with (default: empty)\n"); + fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); + fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); + fprintf(stdout, " not supported with --interactive or other interactive options\n"); + fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); + fprintf(stdout, " --random-prompt start with a randomized prompt.\n"); + fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); + fprintf(stdout, " -f FNAME, --file FNAME\n"); + fprintf(stdout, " prompt file to start generation.\n"); + fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); + fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); + fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + fprintf(stdout, " --mirostat N use Mirostat sampling.\n"); + fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n"); + fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); + fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n"); + fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); + fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); + fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); + fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); + fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n"); + fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n"); - fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n"); - fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); + fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n"); + fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); - fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); + fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); + fprintf(stdout, " number of layers to store in VRAM\n"); + fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); + fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); + fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); #endif - fprintf(stderr, " --mtest compute maximum memory usage\n"); - fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); - fprintf(stderr, " --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); + fprintf(stdout, " --mtest compute maximum memory usage\n"); + fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); + fprintf(stdout, " --verbose-prompt print prompt before generation\n"); + fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stdout, " -m FNAME, --model FNAME\n"); + fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); + fprintf(stdout, "\n"); } std::string gpt_random_prompt(std::mt19937 & rng) { @@ -580,6 +590,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.n_ctx = params.n_ctx; lparams.n_batch = params.n_batch; + lparams.n_gqa = params.n_gqa; lparams.n_gpu_layers = params.n_gpu_layers; lparams.main_gpu = params.main_gpu; lparams.tensor_split = params.tensor_split; diff --git a/examples/common.h b/examples/common.h index c936de6fa..fb8f6d65f 100644 --- a/examples/common.h +++ b/examples/common.h @@ -27,6 +27,7 @@ struct gpt_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM @@ -47,7 +48,7 @@ struct gpt_params { int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float frequency_penalty = 0.00f; // 0.0 = disabled float presence_penalty = 0.00f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate diff --git a/examples/llm.vim b/examples/llm.vim index 16e308c38..efecad0cd 100644 --- a/examples/llm.vim +++ b/examples/llm.vim @@ -2,57 +2,22 @@ function! Llm() let url = "http://127.0.0.1:8080/completion" - " Save the current cursor position - let save_cursor = getpos('.') - - silent! %s/\n/\\n/g - silent! %s/\t/\\t/g - silent! %s/\\n$// - " Get the content of the current buffer let buffer_content = join(getline(1, '$'), "\n") - " Replace true newlines with "\n" - let buffer_content = substitute(buffer_content, '\n', '\\n', 'g') - - " Trim leading/trailing whitespace - let buffer_content = substitute(buffer_content, '^\s\+', '', '') - let buffer_content = substitute(buffer_content, '\s\+$', '', '') - " Create the JSON payload - " can't escape backslash, \n gets replaced as \\n - let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}' - - let prompt_tmpfile = tempname() - let response_tmpfile = tempname() - call writefile([json_payload], prompt_tmpfile) + let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false} + let json_payload.prompt = buffer_content " Define the curl command - let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url - silent execute '!'.curl_command - - let response = join(readfile(response_tmpfile), '') - let start_marker = '{"content":"' - let end_marker = '","generation_settings' - let content_start = stridx(response, start_marker) + len(start_marker) - let content_end = stridx(response, end_marker, content_start) + let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url + let response = system(curl_command, json_encode(json_payload)) " Extract the content field from the response - let content = strpart(response, content_start, content_end - content_start) + let content = json_decode(response).content " Insert the content at the cursor position call setline(line('.'), getline('.') . content) - - " Replace newline "\n" strings with actual newlines in the content - silent! %s/\\n/\r/g - " and tabs - silent! %s/\\t/\t/g - " and quote marks for C sources - silent! %s/\\"/\"/g - - " Remove the temporary file - call delete(prompt_tmpfile) - call delete(response_tmpfile) endfunction command! Llm call Llm() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4b4cd1de4..3bd8ba262 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -93,8 +93,8 @@ int main(int argc, char ** argv) { } if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);" - " you are on your own\n", __func__, params.n_ctx); + // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048 + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); } else if (params.n_ctx < 8) { fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 47d2f3434..2c6aebe03 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1787,11 +1787,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } } -static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { + const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / (nchannels_y / nchannels_x); const int nrows_y = ncols_x; const int nrows_dst = nrows_x; @@ -1807,7 +1811,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const } // x is transposed and permuted - const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x; + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; const float xi = __half2float(x[ix]); const int row_y = col_x; @@ -1835,12 +1839,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, - const int row_stride_x, const int channel_stride_x) { + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / channel_x_divisor; const int nrows_y = ncols_x; const int nrows_dst = nrows_x; @@ -1857,7 +1862,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous break; } - const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x; + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; const float xi = __half2float(x[ix]); const int row_y = col_x; @@ -2366,20 +2371,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } -static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) { - const dim3 block_nums(1, nrows_x, nchannels_x); +static void ggml_mul_mat_p021_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const int nchannels_x, const int nchannels_y, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); const dim3 block_dims(WARP_SIZE, 1, 1); - mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x); + mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y); } static void ggml_mul_mat_vec_nc_f16_f32_cuda( const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, - const int nchannels_x, const int channel_stride_x, cudaStream_t stream) { + const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) { - const dim3 block_nums(1, nrows_x, nchannels_x); + const dim3 block_nums(1, nrows_x, nchannels_y); const dim3 block_dims(WARP_SIZE, 1, 1); mul_mat_vec_nc_f16_f32<<>> - (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x); + (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); } static void ggml_cpy_f32_f32_cuda( @@ -2726,6 +2734,7 @@ inline void ggml_cuda_op_mul( (void) dst; (void) src0_ddq_i; (void) i02; + (void) i1; } inline void ggml_cuda_op_gelu( @@ -3021,15 +3030,15 @@ inline void ggml_cuda_op_rope( const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; // RoPE alteration for extended context + float freq_base, freq_scale; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale; @@ -3045,6 +3054,7 @@ inline void ggml_cuda_op_rope( rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); } + (void) src1; (void) dst; (void) src0_ddq_i; (void) src1_ddf_i; @@ -3063,11 +3073,12 @@ inline void ggml_cuda_op_diag_mask_inf( const int64_t ne01 = src0->ne[1]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; // compute diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + (void) src1; (void) dst; (void) src0_ddq_i; (void) src1_ddf_i; @@ -3135,6 +3146,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm const int64_t ne11 = use_src1 ? src1->ne[1] : 1; const int64_t ne12 = use_src1 ? src1->ne[2] : 1; const int64_t ne13 = use_src1 ? src1->ne[3] : 1; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(ne03 == ne13); const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; @@ -3146,12 +3160,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); // strides for iteration over dims 3 and 2 - const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03; - const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1; + const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13; + const int64_t num_iters = flatten_rows ? 1 : num_iters_0; + const int64_t stride_mod = flatten_rows ? num_iters_0 : 1; const int64_t src0_stride = ne00 * ne01 * stride_mod; const int64_t src1_stride = ne10 * ne11 * stride_mod; const int64_t dst_stride = ne0 * ne1 * stride_mod; + const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; + const int64_t i03_max = flatten_rows ? 1 : ne03; + const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12); + const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; + GGML_ASSERT(!(flatten_rows && ne02 < ne12)); + const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); @@ -3168,6 +3189,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 < ne12)); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); @@ -3204,7 +3226,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; } else { row_low = 0; - row_high = nrows0; + row_high = nrows0*i02_divisor; } if (row_low == row_high) { continue; @@ -3252,16 +3274,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); } - const int64_t i03_max = flatten_rows ? 1 : ne03; - const int64_t i02_max = flatten_rows ? 1 : ne02; - const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; - for (int64_t i03 = 0; i03 < i03_max; i03++) { const int64_t i13 = i03 % ne13; for (int64_t i02 = 0; i02 < i02_max; i02++) { const int64_t i12 = i02 % ne12; - const int64_t i0 = i03*ne02 + i02; + const int64_t i0 = i03*i02_max + i02; // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs const int64_t i0_offset_low = row_low/rows_per_iter; @@ -3295,10 +3313,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm const int64_t i11 = i13*ne12 + i12; // for split tensors the data begins at i0 == i0_offset_low - char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; - float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; + char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs; + float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride; float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; - float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; + float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; // for split tensors the data pointer needs to be rounded down // to the bin edge for i03, i02 bins beyond the first @@ -3337,11 +3355,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } } - if (!src0_on_device || !src0_is_contiguous) { + if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { if (src0_is_f32) { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); } else { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); } } @@ -3495,6 +3513,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne12 = src1->ne[2]; + CUDA_CHECK(cudaSetDevice(g_main_device)); cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; @@ -3507,7 +3527,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -3521,6 +3541,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne12 = src1->ne[2]; + const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; @@ -3539,7 +3561,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int row_stride_x = nb01 / sizeof(half); const int channel_stride_x = nb02 / sizeof(half); - ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main); } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3776,7 +3798,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); + memcpy(&offset, tensor->op_params, sizeof(size_t)); } extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; diff --git a/ggml-metal.m b/ggml-metal.m index 2810fa2a8..bf3f68fe4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -42,6 +42,7 @@ struct ggml_metal_context { id pipeline_##name GGML_METAL_DECL_KERNEL(add); + GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast GGML_METAL_DECL_KERNEL(scale); @@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); GGML_METAL_ADD_KERNEL(add); + GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul_row); GGML_METAL_ADD_KERNEL(scale); @@ -464,10 +466,16 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - [encoder setComputePipelineState:ctx->pipeline_add]; + if (ggml_nelements(src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_add_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_add]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; const int64_t n = ggml_nelements(dst); @@ -577,7 +585,7 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *)(dst->op_params))[0]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -842,9 +850,10 @@ void ggml_metal_graph_compute( GGML_ASSERT((src0t == GGML_TYPE_F32)); - const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past); - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); if (__builtin_popcount(n_head) != 1) { GGML_ASSERT(false && "only power-of-two n_head implemented"); @@ -882,15 +891,14 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; float freq_base; float freq_scale; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -919,7 +927,9 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_DUP: case GGML_OP_CPY: + case GGML_OP_CONT: { if (encoder == nil) { encoder = [command_buffer computeCommandEncoder]; diff --git a/ggml-metal.metal b/ggml-metal.metal index 5a9a6d842..987376d56 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -67,6 +67,17 @@ kernel void kernel_add( dst[tpig] = src0[tpig] + src1[tpig]; } +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_add_row( + device const float * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig % ne00]; +} + kernel void kernel_mul( device const float * src0, device const float * src1, diff --git a/ggml.c b/ggml.c index 5397f159c..4c0b28aa8 100644 --- a/ggml.c +++ b/ggml.c @@ -4591,6 +4591,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, + /*.op_params =*/ {0}, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4970,6 +4971,11 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * return tensor; } +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { @@ -5020,7 +5026,6 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5144,23 +5149,13 @@ struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); - - ((int32_t *) c->data)[0] = nb1; - ((int32_t *) c->data)[1] = nb2; - ((int32_t *) c->data)[2] = nb3; - ((int32_t *) c->data)[3] = offset; - ((int32_t *) c->data)[4] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -5333,7 +5328,6 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5367,7 +5361,6 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5402,7 +5395,6 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5435,7 +5427,6 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5462,7 +5453,6 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5485,7 +5475,6 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5509,7 +5498,6 @@ struct ggml_tensor * ggml_argmax( result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5587,7 +5575,6 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5622,7 +5609,6 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5656,7 +5642,6 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5690,7 +5675,6 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5724,7 +5708,6 @@ struct ggml_tensor * ggml_tanh_impl( result->op = GGML_OP_TANH; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5758,7 +5741,6 @@ struct ggml_tensor * ggml_elu_impl( result->op = GGML_OP_ELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5792,7 +5774,6 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5826,7 +5807,6 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5860,7 +5840,6 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5894,7 +5873,6 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5949,10 +5927,11 @@ struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + // TODO: maybe store epsilon here? + result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5981,10 +5960,11 @@ struct ggml_tensor * ggml_rms_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + // TODO: maybe store epsilon here? + result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -6137,23 +6117,13 @@ struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); - - (( int32_t * ) c->data)[0] = nb1; - (( int32_t * ) c->data)[1] = nb2; - (( int32_t * ) c->data)[2] = nb3; - (( int32_t * ) c->data)[3] = offset; - (( int32_t * ) c->data)[4] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -6278,7 +6248,6 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6322,7 +6291,6 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6347,7 +6315,6 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6373,7 +6340,6 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6400,7 +6366,6 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6429,7 +6394,6 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6451,19 +6415,11 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6489,13 +6445,7 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; @@ -6504,8 +6454,6 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6533,13 +6481,7 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6548,8 +6490,6 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6579,13 +6519,7 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6594,8 +6528,6 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6656,22 +6588,9 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - - ((int32_t *) b->data)[0] = axis0; - ((int32_t *) b->data)[1] = axis1; - ((int32_t *) b->data)[2] = axis2; - ((int32_t *) b->data)[3] = axis3; - - ggml_scratch_load(ctx); - - result->src[2] = b; - } + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); return result; } @@ -6699,7 +6618,6 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6777,7 +6695,6 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6798,19 +6715,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, inplace ? 1 : 0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6845,20 +6755,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(b, "n_past, inplace"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, inplace ? 1 : 0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6894,7 +6796,6 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6970,23 +6871,14 @@ struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - ((int32_t *) b->data)[3] = n_ctx; - memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); - memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); - - ggml_scratch_load(ctx); + int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7043,22 +6935,12 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - ggml_set_name(b, "n_past, n_dims, mode"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - ((int32_t *) b->data)[3] = n_ctx; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, n_dims, mode, n_ctx }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7083,21 +6965,13 @@ struct ggml_tensor * ggml_alibi( //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_view_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_head; - GGML_ASSERT(sizeof(float) == sizeof(int32_t)); - (((float *) b->data)[2]) = bias_max; - - ggml_scratch_load(ctx); + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_set_op_params(result, &op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7119,19 +6993,12 @@ struct ggml_tensor * ggml_clamp( // TODO: when implement backward, fix this: struct ggml_tensor * result = ggml_view_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); - - ((float *) b->data)[0] = min; - ((float *) b->data)[1] = max; - - ggml_scratch_load(ctx); + float params[] = { min, max }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7164,18 +7031,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = p0; - ((int32_t*)c->data)[2] = d0; - ggml_scratch_load(ctx); + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -7208,21 +7070,13 @@ struct ggml_tensor* ggml_conv_2d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = s1; - ((int32_t*)c->data)[2] = p0; - ((int32_t*)c->data)[3] = p1; - ((int32_t*)c->data)[4] = d0; - ((int32_t*)c->data)[5] = d1; - ggml_scratch_load(ctx); + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; @@ -7246,7 +7100,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { return (ins + 2 * p - ks) / s + 1; } -// ggml_pool_2d +// ggml_pool_1d struct ggml_tensor* ggml_pool_1d( struct ggml_context * ctx, @@ -7269,18 +7123,12 @@ struct ggml_tensor* ggml_pool_1d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - ((int32_t*)c->data)[0] = op; - ((int32_t*)c->data)[1] = k0; - ((int32_t*)c->data)[2] = s0; - ((int32_t*)c->data)[3] = p0; - ggml_scratch_load(ctx); + int32_t params[] = { op, k0, s0, p0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = c; return result; } @@ -7312,21 +7160,12 @@ struct ggml_tensor* ggml_pool_2d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); - ((int32_t*)c->data)[0] = op; - ((int32_t*)c->data)[1] = k0; - ((int32_t*)c->data)[2] = k1; - ((int32_t*)c->data)[3] = s0; - ((int32_t*)c->data)[4] = s1; - ((int32_t*)c->data)[5] = p0; - ((int32_t*)c->data)[6] = p1; - ggml_scratch_load(ctx); + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = c; return result; } @@ -7485,21 +7324,12 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = npx; - ((int32_t *) b->data)[1] = npy; - ((int32_t *) b->data)[2] = w; - - ggml_scratch_load(ctx); + int32_t params[] = { npx, npy, w }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = b; return result; } @@ -7524,19 +7354,12 @@ struct ggml_tensor * ggml_win_unpart( const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - - ((int32_t *) b->data)[0] = w; - - ggml_scratch_load(ctx); + int32_t params[] = { w }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = b; return result; } @@ -7554,19 +7377,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[2] = addr_tensor; return result; } @@ -7601,20 +7418,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; return result; } @@ -7648,19 +7459,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[2] = addr_tensor; return result; } @@ -7693,20 +7498,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; return result; } @@ -7742,21 +7541,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; - result->src[3] = c; + result->src[2] = c; return result; } @@ -8984,21 +8777,17 @@ static void ggml_compute_forward_acc_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); - // view src0 and dst with these strides and data offset inbytes during acc // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. @@ -9067,13 +8856,12 @@ static void ggml_compute_forward_acc( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_acc_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11093,21 +10881,17 @@ static void ggml_compute_forward_set_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); - // view src0 and dst with these strides and data offset inbytes during set // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. @@ -11167,13 +10951,12 @@ static void ggml_compute_forward_set( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_set_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11569,17 +11352,14 @@ static void ggml_compute_forward_diag( static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const float value) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) src1->data)[0]; - const bool inplace = (bool)((int32_t *) src1->data)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = (bool)((int32_t *) dst->op_params)[1]; GGML_ASSERT(n_past >= 0); @@ -11622,12 +11402,11 @@ static void ggml_compute_forward_diag_mask_f32( static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); } break; default: { @@ -11639,12 +11418,11 @@ static void ggml_compute_forward_diag_mask_inf( static void ggml_compute_forward_diag_mask_zero( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); + ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); } break; default: { @@ -11842,20 +11620,17 @@ static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_alibi_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); @@ -11908,20 +11683,17 @@ static void ggml_compute_forward_alibi_f32( static void ggml_compute_forward_alibi_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); @@ -11974,16 +11746,15 @@ static void ggml_compute_forward_alibi_f16( static void ggml_compute_forward_alibi( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, src1, dst); + ggml_compute_forward_alibi_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, src1, dst); + ggml_compute_forward_alibi_f32(params, src0, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -12013,19 +11784,17 @@ static void ggml_compute_forward_alibi( static void ggml_compute_forward_clamp_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const float min = ((float *) src1->data)[0]; - const float max = ((float *) src1->data)[1]; + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -12055,12 +11824,11 @@ static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_clamp_f32(params, src0, src1, dst); + ggml_compute_forward_clamp_f32(params, src0, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -12090,10 +11858,7 @@ static void ggml_compute_forward_clamp( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12102,12 +11867,12 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -12222,10 +11987,7 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12234,12 +11996,12 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -12354,16 +12116,15 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_compute_forward_rope_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_compute_forward_rope_f32(params, src0, dst); } break; default: { @@ -12377,10 +12138,7 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12390,9 +12148,9 @@ static void ggml_compute_forward_rope_back_f32( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; assert(n_past >= 0); @@ -12476,10 +12234,7 @@ static void ggml_compute_forward_rope_back_f32( static void ggml_compute_forward_rope_back_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12489,9 +12244,9 @@ static void ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; assert(n_past >= 0); @@ -12575,16 +12330,15 @@ static void ggml_compute_forward_rope_back_f16( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + ggml_compute_forward_rope_back_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + ggml_compute_forward_rope_back_f32(params, src0, dst); } break; default: { @@ -12781,7 +12535,7 @@ static void ggml_compute_forward_conv_1d_s1_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { @@ -12984,7 +12738,7 @@ static void ggml_compute_forward_conv_1d_s2_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { @@ -13004,14 +12758,13 @@ static void ggml_compute_forward_conv_1d_s2_ph( // ggml_compute_forward_conv_1d static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t p0 = ((const int32_t*)(opt0->data))[1]; - const int32_t d0 = ((const int32_t*)(opt0->data))[2]; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; GGML_ASSERT(d0 == 1); // dilation not supported GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported if (s0 == 1) { @@ -13029,7 +12782,6 @@ static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13049,12 +12801,12 @@ static void ggml_compute_forward_conv_2d_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t s1 = ((const int32_t*)(opt0->data))[1]; - const int32_t p0 = ((const int32_t*)(opt0->data))[2]; - const int32_t p1 = ((const int32_t*)(opt0->data))[3]; - const int32_t d0 = ((const int32_t*)(opt0->data))[4]; - const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); @@ -13126,17 +12878,15 @@ static void ggml_compute_forward_conv_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst - ) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); GGML_ASSERT(false); } break; default: @@ -13201,12 +12951,11 @@ static void ggml_compute_forward_pool_1d_sk_p0( // ggml_compute_forward_pool_1d static void ggml_compute_forward_pool_1d( - const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* opt0, - struct ggml_tensor* dst) { - GGML_ASSERT(opt0->ne[0] == 4); - const int* opts = (const int*)opt0->data; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t* opts = (const int32_t*)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -13220,12 +12969,12 @@ static void ggml_compute_forward_pool_1d( // ggml_compute_forward_pool_2d_sk_p0 static void ggml_compute_forward_pool_2d_sk_p0( - const struct ggml_compute_params * params, - const enum ggml_op_pool op, - const struct ggml_tensor * src, - const int k0, - const int k1, - struct ggml_tensor * dst) { + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); @@ -13285,12 +13034,11 @@ static void ggml_compute_forward_pool_2d_sk_p0( // ggml_compute_forward_pool_2d static void ggml_compute_forward_pool_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(opt0->ne[0] == 7); - const int* opts = (const int*)opt0->data; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; @@ -13315,7 +13063,7 @@ static void ggml_compute_forward_flash_attn_f32( const struct ggml_tensor * k, const struct ggml_tensor * v, const bool masked, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13493,7 +13241,7 @@ static void ggml_compute_forward_flash_attn_f16( const struct ggml_tensor * k, const struct ggml_tensor * v, const bool masked, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -14258,7 +14006,6 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -14267,9 +14014,9 @@ static void ggml_compute_forward_win_part_f32( GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; - const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; - const int32_t w = ((const int32_t *)(opt0->data))[2]; + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; assert(ne00 == ne0); assert(ne3 == nep0*nep1); @@ -14303,12 +14050,11 @@ static void ggml_compute_forward_win_part_f32( static void ggml_compute_forward_win_part( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_part_f32(params, src0, opt0, dst); + ggml_compute_forward_win_part_f32(params, src0, dst); } break; default: { @@ -14322,7 +14068,6 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -14331,7 +14076,7 @@ static void ggml_compute_forward_win_unpart_f32( GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t w = ((const int32_t *)(opt0->data))[0]; + const int32_t w = ((const int32_t *)(dst->op_params))[0]; // padding const int px = (w - ne1%w)%w; @@ -14365,12 +14110,11 @@ static void ggml_compute_forward_win_unpart_f32( static void ggml_compute_forward_win_unpart( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst); + ggml_compute_forward_win_unpart_f32(params, src0, dst); } break; default: { @@ -14889,7 +14633,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SUB: { @@ -15009,7 +14753,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CPY: { @@ -15049,11 +14793,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX: { @@ -15065,35 +14809,35 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_POOL_1D: { - ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); } break; case GGML_OP_POOL_2D: { - ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); } break; case GGML_OP_FLASH_ATTN: { @@ -15115,40 +14859,45 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15212,12 +14961,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); - GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; - const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15525,12 +15272,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); - GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; - const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -15607,8 +15352,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); - memcpy(&offset, tensor->src[2]->data, sizeof(offset)); + memcpy(&offset, tensor->op_params, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -15635,7 +15379,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->src[2]->data; + int32_t * axes = (int32_t *) tensor->op_params; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -15691,9 +15435,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), @@ -15707,9 +15449,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), @@ -15738,12 +15478,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 6); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_rope_back(ctx, @@ -15761,12 +15499,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_ROPE_BACK: { if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_rope(ctx, @@ -16547,10 +16283,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: { n_tasks = 1; } break; + case GGML_OP_DIAG_MASK_ZERO: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -17294,7 +17030,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); uint64_t offs; - memcpy(&offs, args[2]->data, sizeof(offs)); + memcpy(&offs, tensor->op_params, sizeof(offs)); tensor->data = ((char *) tensor->data) + offs; } break; diff --git a/ggml.h b/ggml.h index 3da70c02e..3880031b3 100644 --- a/ggml.h +++ b/ggml.h @@ -199,6 +199,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 +#define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 @@ -417,6 +418,9 @@ extern "C" { // compute data enum ggml_op op; + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)]; + bool is_param; struct ggml_tensor * grad; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 7eafa55c3..bcaead4d4 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -472,6 +472,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in llama_ctx_params.main_gpu = cu_parseinfo_maindevice; llama_ctx_params.rope_freq_base = rope_freq_base; llama_ctx_params.rope_freq_scale = rope_freq_scale; + llama_ctx_params.n_batch = blasbatchsize; llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params); diff --git a/llama.cpp b/llama.cpp index fa95e5a99..17cd6cd33 100644 --- a/llama.cpp +++ b/llama.cpp @@ -68,6 +68,7 @@ enum e_model { MODEL_13B, MODEL_30B, MODEL_65B, + MODEL_70B, }; static const size_t kB = 1024; @@ -105,12 +106,12 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { - /* empirical scaling, still a guess */ - { MODEL_3B, ((size_t) n_ctx / 12ull + 320ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 11ull + 440ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 10ull + 560ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 9ull + 680ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 6ull + 820ull) * MB }, + { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess + { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB }, }; return k_sizes; } @@ -118,11 +119,12 @@ static const std::map & MEM_REQ_SCRATCH0(int n_ctx) static const std::map & MEM_REQ_SCRATCH1() { static std::map k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 320ull * MB }, - { MODEL_13B, 400ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 840ull * MB }, // guess + { MODEL_3B, 128ull * MB }, + { MODEL_7B, 160ull * MB }, + { MODEL_13B, 192ull * MB }, + { MODEL_30B, 256ull * MB }, + { MODEL_65B, 384ull * MB }, // guess + { MODEL_70B, 304ull * MB }, }; return k_sizes; } @@ -131,11 +133,12 @@ static const std::map & MEM_REQ_SCRATCH1() static const std::map & MEM_REQ_EVAL() { static std::map k_sizes = { - { MODEL_3B, 16ull * MB }, - { MODEL_7B, 32ull * MB }, - { MODEL_13B, 48ull * MB }, - { MODEL_30B, 64ull * MB }, - { MODEL_65B, 96ull * MB }, // guess + { MODEL_3B, 8ull * MB }, + { MODEL_7B, 10ull * MB }, + { MODEL_13B, 12ull * MB }, + { MODEL_30B, 16ull * MB }, + { MODEL_65B, 24ull * MB }, // guess + { MODEL_70B, 24ull * MB }, }; return k_sizes; } @@ -150,6 +153,7 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() { MODEL_13B, 640ull * kB }, { MODEL_30B, 768ull * kB }, { MODEL_65B, 1536ull * kB }, + { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) }; return k_sizes; } @@ -164,19 +168,25 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { MODEL_13B, 160ull }, { MODEL_30B, 208ull }, { MODEL_65B, 416ull }, + { MODEL_70B, 416ull }, // TODO (likely can be reduced) }; return k_sizes; } // default hparams (LLaMA 7B) struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 256; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + // LLaMAv2 + // TODO: load from model data hparams + float f_ffn_mult = 1.0f; float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -184,12 +194,24 @@ struct llama_hparams { enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); + return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + } + + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); } size_t kv_size() const { size_t result = 2ull; - result *= (size_t) n_embd; + result *= (size_t) n_embd_gqa(); result *= (size_t) n_ctx; result *= (size_t) n_layer; result *= sizeof(ggml_fp16_t); @@ -495,12 +517,16 @@ struct llama_file_loader { } void read_hparams() { hparams.n_vocab = file.read_u32(); - hparams.n_embd = file.read_u32(); - hparams.n_mult = file.read_u32(); - hparams.n_head = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); - hparams.n_rot = file.read_u32(); - hparams.ftype = (enum llama_ftype) file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.ftype = (enum llama_ftype) file.read_u32(); + + // LLaMAv2 + // TODO: read from header + hparams.n_head_kv = hparams.n_head; } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); @@ -799,7 +825,7 @@ static bool kv_cache_init( ggml_type wtype, int n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd_gqa(); const int n_layer = hparams.n_layer; const int64_t n_mem = n_layer*n_ctx; @@ -843,6 +869,7 @@ struct llama_context_params llama_context_default_params() { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, + /*.n_gqa =*/ 1, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, @@ -962,6 +989,7 @@ static const char *llama_model_type_name(e_model type) { case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; default: LLAMA_ASSERT(false); } } @@ -972,6 +1000,7 @@ static void llama_model_load_internal( llama_vocab & vocab, int n_ctx, int n_batch, + int n_gqa, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -986,6 +1015,7 @@ static void llama_model_load_internal( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); + size_t blasbatchmul = (n_batch>512?2:1); std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); @@ -993,6 +1023,7 @@ static void llama_model_load_internal( model.hparams = ml->file_loader->hparams; model.n_gpu_layers = n_gpu_layers; llama_file_version file_version = ml->file_loader->file_version; + auto & hparams = model.hparams; { @@ -1012,11 +1043,25 @@ static void llama_model_load_internal( hparams.n_ctx = n_ctx; + // LLaMAv2 + // TODO: temporary until GGUF + LLAMA_ASSERT(hparams.n_head % n_gqa == 0); + hparams.n_head_kv = hparams.n_head / n_gqa; + if (model.type == e_model::MODEL_65B && n_gqa == 8) { + fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + model.type = e_model::MODEL_70B; + hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model + } + hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_scale = rope_freq_scale; } - const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; + // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 + const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3; + const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw; + const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; + //const uint32_t n_ff = 28672; { fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); @@ -1025,12 +1070,14 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } @@ -1100,9 +1147,10 @@ static void llama_model_load_internal( size_t vram_weights = 0; size_t vram_scratch = 0; { - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd_gqa = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; ml->ggml_ctx = ctx; @@ -1150,16 +1198,16 @@ static void llama_model_load_internal( layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -1180,9 +1228,9 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size - vram_weights + // weights in VRAM not in memory - MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); + blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + + blasbatchmul*MEM_REQ_SCRATCH1().at(model.type) + + blasbatchmul*MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = @@ -1283,6 +1331,7 @@ static bool llama_model_load( llama_vocab & vocab, int n_ctx, int n_batch, + int n_gqa, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -1296,7 +1345,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1340,17 +1389,22 @@ static bool llama_eval_internal( LLAMA_ASSERT(!!kv_self.ctx); - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_embd/hparams.n_head; - const int n_gpu_layers = model.n_gpu_layers; + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_vocab = hparams.n_vocab; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + LLAMA_ASSERT(n_embd_head == hparams.n_rot); const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const int n_gpu_layers = model.n_gpu_layers; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1448,11 +1502,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -1464,17 +1518,17 @@ static bool llama_eval_internal( offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -1493,8 +1547,8 @@ static bool llama_eval_internal( struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), - n_embd/n_head, n_head, n_past + N), + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa), + n_embd_head, n_head_kv, n_past + N), 0, 2, 1, 3); offload_func_kq(K); ggml_set_name(K, "K"); @@ -1504,9 +1558,9 @@ static bool llama_eval_internal( offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); - // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled = KQ / sqrt(n_embd_head) struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); @@ -1526,10 +1580,10 @@ static bool llama_eval_internal( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd/n_head, n_head, + n_past + N, n_embd_head, n_head_kv, n_ctx*ggml_element_size(kv_self.v), - n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, - il*n_ctx*ggml_element_size(kv_self.v)*n_embd); + n_ctx*ggml_element_size(kv_self.v)*n_embd_head, + n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il); offload_func_v(V); ggml_set_name(V, "V"); @@ -1541,7 +1595,7 @@ static bool llama_eval_internal( // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); #endif @@ -2695,7 +2749,7 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { @@ -2725,6 +2779,8 @@ struct llama_context * llama_new_context_with_model( params.seed = time(NULL); } + size_t blasbatchmul = (params.n_batch>512?2:1); + unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -2773,10 +2829,10 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(blasbatchmul*MEM_REQ_EVAL().at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); + ctx->buf_scratch[0].resize(blasbatchmul*MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); + ctx->buf_scratch[1].resize(blasbatchmul*MEM_REQ_SCRATCH1().at(ctx->model.type)); } #ifdef GGML_USE_METAL @@ -2797,7 +2853,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ diff --git a/llama.h b/llama.h index bbf28e686..1089909a6 100644 --- a/llama.h +++ b/llama.h @@ -83,11 +83,12 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams) + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)