Merge branch 'ggerganov:master' into master

commit 83ca507cfd — 25 changed files with 734 additions and 494 deletions
.devops/full-cuda.Dockerfile — 33 additions (new file)

@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/main-cuda.Dockerfile — 32 additions (new file)

@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
.github/workflows/build.yml — 12 changes (vendored)

@@ -17,6 +17,9 @@ on:

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  GGML_NLOOP: 3
+  GGML_NITER: 1
+  GGML_N_THREADS: 1

 jobs:
   ubuntu-focal-make:
@@ -64,7 +67,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -99,7 +102,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   macOS-latest-make:
     runs-on: macos-latest
@@ -147,10 +150,11 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900

   windows-latest-cmake:
     runs-on: windows-latest

     env:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
@@ -249,7 +253,7 @@ jobs:
        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
        run: |
          cd build
-         ctest -C Release --verbose
+         ctest -C Release --verbose --timeout 900

       - name: Get commit hash
         id: commit
Makefile — 8 changes

@@ -163,7 +163,12 @@ ifdef LLAMA_CUBLAS
    LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
    OBJS      += ggml-cuda.o
    NVCC      = nvcc
-   NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+   NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef CUDA_DOCKER_ARCH
+   NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+   NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
    NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
@@ -187,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
    NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
    $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
README.md — 34 changes

@@ -86,7 +86,7 @@
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
-- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
+- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))

 **Bindings:**

@@ -731,6 +731,38 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

+### Docker With CUDA
+
+Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU-enabled cloud, `cuBLAS` should be accessible inside the container.
+
+#### Building Locally
+
+```bash
+docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `CUDA_VERSION` set to `11.7.1`
+- `CUDA_DOCKER_ARCH` set to `all`
+
+The resulting images are essentially the same as the non-CUDA images:
+
+1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+
+#### Usage
+
+After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+```
+
 ### Contributing

 - Contributors can open PRs
convert.py (the per-file headers for this and several of the following files were lost in extraction; names are inferred from their content)

@@ -154,9 +154,15 @@ class Params:
     # try transformer naming first
     if "model.layers.0.self_attn.q_proj.weight" in model:
         n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+    elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+        n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
     else:
         n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

+    if n_layer < 1:
+        raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                        "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
     n_head=n_embd // 128 # guessed

     return Params(
@@ -822,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:

 SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'BF16': DT_BF16,
     'F16': DT_F16,
     'F32': DT_F32,
     'I32': DT_I32,
examples/alpaca.sh

@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \
examples/baby-llama/baby-llama.cpp

@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor(
     struct ggml_tensor * tensor,
     int ndims,
@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;

+    std::vector<uint8_t> work_buffer;
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size   =*/ compute_size,
@@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
         int n_past = 0;

         ggml_cgraph gf = {};
-        gf.n_threads = 1;

         get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

         float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

         float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
         struct ggml_context * ctx0 = ggml_init(params);

         ggml_cgraph gf = {};
-        gf.n_threads = 1;

         int n_past = 0;
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

         ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
     }

     print_matrix(model.tok_embeddings);

     printf("done\n");

     // ggml_free(kv_self.ctx);
     // ggml_free(model_lora.ctx);
     ggml_free(model.ctx);

     return 0;
 }
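The `ggml_graph_compute_helper` shim added above (and repeated verbatim in the other examples below) exists because this merge picks up the upstream ggml change that removes `n_threads`, `work_size`, and `work` from `ggml_cgraph` (see the ggml.c hunks near the end of this diff): callers now build a `ggml_cplan` and own the scratch buffer themselves. A minimal C++ sketch of the new calling convention, assuming only the `ggml_graph_plan`/`ggml_cplan` API visible in this diff:

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

// Replaces the old one-liner `ggml_graph_compute(ctx, &gf)`.
static void compute_graph(struct ggml_cgraph * graph, int n_threads) {
    // the plan reports how much scratch memory the graph needs
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    std::vector<uint8_t> buf;
    if (plan.work_size > 0) {
        buf.resize(plan.work_size);  // the caller, not the ggml context, owns this buffer
        plan.work_data = buf.data();
    }

    // n_threads now travels inside the plan instead of inside the graph
    ggml_graph_compute(graph, &plan);
}
```

Reusing one `std::vector` across calls, as the examples do with `work_buffer`, avoids reallocating the scratch memory on every graph evaluation.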
examples/benchmark/benchmark-matmult.cpp

@@ -20,6 +20,17 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);

-    gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);

     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);

-    ggml_graph_compute(ctx, &gf);
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

     TENSOR_DUMP(gf.nodes[0]);

@@ -187,7 +199,6 @@ int main(int argc, char ** argv) {

     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=benchmark_params.n_threads;

     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv) {

     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);

     const int dimx = sizex;
     const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv) {

         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute(ctx, &gf31);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);

         long long int stop = ggml_time_us();
         long long int usec = stop-start;
         double gflops = (double)(flops_per_matrix)/usec/1000.0;
         gflops_sum += gflops;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
-            gf31.n_threads,
+            benchmark_params.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
             usec,gflops);

@@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
         }

         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute(ctx, &gf32);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
     }
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
examples/embedding/embedding.cpp

@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
examples/main/main.cpp

@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_context * ctx_eval = NULL;
|
struct ggml_context * ctx_eval = NULL;
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
auto * ctx_metal = ggml_metal_init();
|
auto * ctx_metal = ggml_metal_init(1);
|
||||||
|
|
||||||
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
||||||
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
||||||
|
|
examples/perplexity/perplexity.cpp

@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);

     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
examples/server/README.md

@@ -7,7 +7,7 @@ Command line options:
 - `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, baichuan models were built with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
examples/train-text-from-scratch/train-text-from-scratch.cpp

@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
     return rnd->rd(rnd->gen);
 }

+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
@@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(

     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;

     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;
@@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) {
     printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
     // ggml_print_tensor_objects(model.ctx);

+    // TODO: use std::vector<uint8_t> instead of "new"
     size_t    compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
     uint8_t * compute_addr = new uint8_t[compute_size];

@@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }

+    std::vector<uint8_t> work_buffer;
+
     printf("%s: begin training\n", __func__);

     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) {
         struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
         struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;

-        // ggml_cgraph gf = {};
-        gf->n_threads = params.n_threads;
-        gb->n_threads = params.n_threads;

         get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);

@@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) {
             *gb = ggml_build_backward(ctx0, gf, true);
         }

-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

         size_t used_mem_before_opt = ggml_used_mem(ctx0);

@@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) {
         model.train_samples += n_batch;
         model.train_tokens  += n_batch * n_tokens;

-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

         float error_after_opt = ggml_get_f32_1d(loss, 0);

@@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) {
         struct ggml_context * ctx0 = ggml_init(cparams);

         ggml_cgraph gf = {};
-        gf.n_threads = params.n_threads;

         int n_past = 0;
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

         ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);

         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;

     llama_free(lctx);
     llama_free_model(lmodel);
     ggml_free(model.ctx);
ggml-cuda.cu — 53 changes

@@ -59,8 +59,8 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16

 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_t)(
@@ -131,7 +131,7 @@
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");

-typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);

 //================================= k-quants

@@ -407,7 +407,7 @@

 //================================== k-quants

-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;
@@ -440,7 +440,7 @@

 }

-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {

     const int i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;
@@ -504,7 +504,7 @@
 }
 #endif

-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

     const int i = blockIdx.x;
@@ -544,7 +544,7 @@
 #endif
 }

-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

     const int i = blockIdx.x;
@@ -590,7 +590,7 @@
 #endif
 }

-static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
+static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

     const int i = blockIdx.x;
@@ -634,7 +634,7 @@
 #endif
 }

-static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -742,7 +742,7 @@
     }
 }

-static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -846,7 +846,7 @@
     }
 }

-static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     const int row = blockIdx.y*blockDim.y + threadIdx.y;
     if (row > nrows) return;
@@ -949,7 +949,7 @@
     }
 }

-static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {

     const int row = blockIdx.x;
     const int num_blocks_per_row = ncols / QK_K;
@@ -1053,7 +1053,7 @@
     }
 }

-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

     static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

@@ -1171,7 +1171,7 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }

-static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= k) {
@@ -1207,7 +1207,7 @@
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, float * y, const int k) {
+static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     if (i >= k) {
@@ -1227,7 +1227,7 @@
     y[iybs + iqs + y_offset] = v.y;
 }

-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
 #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

@@ -1252,7 +1252,7 @@
 #endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
 #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

@@ -1277,7 +1277,7 @@
 #endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
 #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

@@ -1312,7 +1312,7 @@
 #endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
 #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

@@ -1346,7 +1346,7 @@
 #endif // __CUDA_ARCH__ >= 600
 }

-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
 #if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

@@ -1366,7 +1366,7 @@
 }

 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;

     if (row >= nrows) {
@@ -1404,7 +1404,7 @@
 }

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -1471,7 +1471,7 @@
     }
 }

-static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
     const half * x = (const half *) vx;

     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1518,7 +1518,7 @@
 }

 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
     const int row_stride_x, const int channel_stride_x) {

     const half * x = (const half *) vx;
@@ -2355,10 +2355,7 @@ inline void ggml_cuda_op_mul_mat_vec(
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0;

-    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
-    // However, they have bad performance with Pascal cards.
-    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
-    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
 #endif

     if (use_mul_mat_vec_q) {
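All of the `__restrict__` additions above follow one pattern: they promise the compiler that a kernel's pointer arguments do not alias, so values read through one pointer cannot be invalidated by stores through another. That lets nvcc keep operands in registers across loop iterations and route const loads through the read-only data cache. An illustrative host-side C++ analogue (not taken from the diff; `__restrict__` is the GCC/Clang/NVCC spelling of the no-alias qualifier):

```cpp
// Without __restrict__ the compiler must assume the store to y[i] may have
// modified *a or x[i], forcing a reload of both on every iteration; with the
// qualifier it may hoist *a into a register and pipeline the x loads freely.
static void scale(const float * __restrict__ x,
                  float * __restrict__ y,
                  const float * __restrict__ a,
                  int n) {
    for (int i = 0; i < n; ++i) {
        y[i] = (*a) * x[i];
    }
}
```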
ggml-metal.h

@@ -34,9 +34,13 @@ extern "C" {

 struct ggml_metal_context;

-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);

+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
ggml-metal.m — 11 changes

@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
 };

 struct ggml_metal_context {
+    int n_cb;
+
     float * logits;

     id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
 @implementation GGMLMetalClass
 @end

-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
     fprintf(stderr, "%s: allocating\n", __func__);

     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

+    ctx->n_cb = n_cb;
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }

+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel

-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;

     NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
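With `n_threads` gone from `ggml_cgraph`, the Metal backend needs its own way to choose how many command buffers to encode in parallel, hence the new `n_cb` field, the `ggml_metal_init(int n_cb)` parameter, and the `ggml_metal_set_n_cb` setter. A hedged sketch of the resulting lifecycle, in C++ for consistency with the examples above (buffer mapping is elided; `ggml_metal_graph_compute` and `ggml_metal_add_buffer` are the pre-existing entry points referenced by the header comments, not shown in this diff):

```cpp
#include "ggml-metal.h"

void run_graph(struct ggml_cgraph * gf) {
    struct ggml_metal_context * ctx = ggml_metal_init(1); // start with 1 command buffer

    // ... map every host buffer the graph touches with ggml_metal_add_buffer(...) ...

    ggml_metal_set_n_cb(ctx, 4);       // later: encode across 4 command buffers
    ggml_metal_graph_compute(ctx, gf); // no longer consults gf->n_threads

    ggml_metal_free(ctx);
}
```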
ggml-opencl.cpp

@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
     const int in = tid - step*im; // 0...15 or 0...7

-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
     const int is = 0;
-#else
+
+\n#else\n
+
     const int l0 = 4 * in; // 0, 4, 8, ..., 28
     const int is = in / 4;
-#endif
+
+\n#endif\n
+
     const int ql_offset = 64*im + l0;
     const int qh_offset = 32*im + l0;
     const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,

     const float d = vload_half(0, &x[i].d);

-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
              + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
              + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
              + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
             +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
     tmp[16 * ix + tid] += sum;
-#else
+\n#else\n
     float sum = 0;
     for (int l = 0; l < 4; ++l) {
         sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
              + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
     }
     tmp[16 * ix + tid] += sum;
-#endif
+\n#endif\n

 }
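The odd-looking `\n#if ...\n` markers are needed because this kernel source lives inside a stringizing macro in ggml-opencl.cpp, and `#`-stringification collapses real newlines into single spaces. A preprocessor directive must start on its own line when the OpenCL driver later compiles the string, so each directive has to carry explicit `\n` escapes that survive into the generated string literal. A reduced C++ illustration of the mechanism (the `MULTILINE_QUOTE` name matches the macro in ggml-opencl.cpp; the kernel body is illustrative):

```cpp
// Stringification turns everything between the parentheses into one string
// literal with all newlines collapsed to spaces; the backslash-n sequences
// below become "\n" inside that literal and re-introduce real line breaks
// when the OpenCL compiler reads the program source.
#define MULTILINE_QUOTE(...) #__VA_ARGS__

const char * kernel_src = MULTILINE_QUOTE(
    float v = 0.0f;
    \n#if K_QUANTS_PER_ITERATION == 1\n
    v = 1.0f;
    \n#endif\n
);
// Without the \n markers the device compiler would see the directive glued
// onto the same line as the surrounding statements and fail to parse it.
```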
230
ggml.c
230
ggml.c
|
@@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.src0         =*/ NULL,
         /*.src1         =*/ NULL,
         /*.opt          =*/ { NULL },
-        /*.n_tasks      =*/ 0,
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        /*.pad          =*/ { 0 },
+        /*.padding      =*/ { 0 },
     };
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
@@ -10718,8 +10717,6 @@ static void ggml_compute_forward_mul_mat(
 
         float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
 
-        assert(ne00 % 32 == 0);
-
         for (int64_t ic = 0; ic < ne11; ++ic) {
             vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
         }
@@ -15772,9 +15769,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     struct ggml_cgraph result = {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size    =*/ 0,
-        /*.work         =*/ NULL,
         /*.nodes        =*/ { NULL },
         /*.grads        =*/ { NULL },
         /*.leafs        =*/ { NULL },
@@ -15945,12 +15939,13 @@ void clear_numa_thread_affinity(void) {}
 #endif
 
 struct ggml_compute_state_shared {
-    struct ggml_cgraph * cgraph;
+    const struct ggml_cgraph * cgraph;
+    const struct ggml_cplan  * cplan;
 
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    int n_threads;
+    const int n_threads;
 
     // synchronization primitives
     atomic_int n_active; // num active threads
@@ -15974,9 +15969,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-    struct ggml_cgraph * cgraph = state->shared->cgraph;
+
+    const struct ggml_cgraph * cgraph = state->shared->cgraph;
+    const struct ggml_cplan  * cplan  = state->shared->cplan;
+
+    const int * n_tasks_arr = cplan->n_tasks;
     const int   n_threads   = state->shared->n_threads;
 
     set_numa_thread_affinity(state->ith, n_threads);
 
     int node_n = -1;
@@ -15989,15 +15988,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /*.type  =*/ GGML_TASK_FINALIZE,
                 /*.ith   =*/ 0,
                 /*.nth   =*/ 0,
-                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
             };
 
             if (node_n != -1) {
                 /* FINALIZE */
                 struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
                 if (GGML_OP_HAS_FINALIZE[node->op]) {
-                    params.nth = node->n_tasks;
+                    params.nth = n_tasks_arr[node_n];
                     ggml_compute_forward(&params, node);
                     ggml_graph_compute_perf_stats_node(node, state->shared);
                 }
@@ -16008,11 +16007,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
 
             struct ggml_tensor * node = cgraph->nodes[node_n];
+            const int n_tasks = n_tasks_arr[node_n];
 
             state->shared->perf_node_start_cycles  = ggml_perf_cycles();
             state->shared->perf_node_start_time_us = ggml_perf_time_us();
 
-            params.nth = node->n_tasks;
+            params.nth = n_tasks;
 
             /* INIT */
             if (GGML_OP_HAS_INIT[node->op]) {
@@ -16020,7 +16020,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 ggml_compute_forward(&params, node);
             }
 
-            if (node->n_tasks == 1) {
+            if (n_tasks == 1) {
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_COMPUTE;
@@ -16042,7 +16042,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             // wait for other threads to finish
             const int last = node_n;
             do {
-                sched_yield();
+                //sched_yield();
                 node_n = atomic_load(&state->shared->node_n);
             } while (node_n == last);
         }
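Note: commenting out sched_yield() trades CPU for latency: the waiting workers now busy-spin on the atomic instead of giving up their time slice, so they wake faster when the coordinator publishes a new node index at the cost of burning a core while they wait. An illustrative C++ sketch of the two strategies (not the ggml code itself):

    #include <atomic>
    // a worker waiting for the coordinator to publish the next node index
    static void wait_for_next_node(std::atomic<int> & node_n, int last) {
        while (node_n.load(std::memory_order_relaxed) == last) {
            // sched_yield();  // yielding variant: lower CPU burn, slower wake-up
            //                 // empty body = pure spin, as the diff switches to
        }
    }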
@@ -16052,16 +16052,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         /* COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
+        const int n_tasks = n_tasks_arr[node_n];
 
         struct ggml_compute_params params = {
             /*.type  =*/ GGML_TASK_COMPUTE,
             /*.ith   =*/ state->ith,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            /*.nth   =*/ n_tasks,
+            /*.wsize =*/ cplan->work_size,
+            /*.wdata =*/ cplan->work_data,
         };
 
-        if (state->ith < node->n_tasks) {
+        if (state->ith < n_tasks) {
             ggml_compute_forward(&params, node);
         }
     }
@@ -16069,36 +16070,31 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }
 
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
+    if (n_threads <= 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+    }
 
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // initialize tasks + work buffer
-    {
     size_t work_size = 0;
 
-    // thread scheduling for the different operations
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
+        int n_tasks = 1;
+
         struct ggml_tensor * node = cgraph->nodes[i];
 
         switch (node->op) {
            case GGML_OP_CPY:
            case GGML_OP_DUP:
                {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
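Note: a hypothetical call site for the new entry point: per the hunk above, a non-positive thread count falls back to GGML_DEFAULT_N_THREADS, so a caller with no preference can simply pass 0.

    struct ggml_cgraph gf   = ggml_build_forward(f);
    struct ggml_cplan  plan = ggml_graph_plan(&gf, /*n_threads =*/ 0); // 0 -> GGML_DEFAULT_N_THREADS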
@@ -16106,24 +16102,24 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src0->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_ACC:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
 
                     if (ggml_is_quantized(node->src0->type)) {
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks;
                     }
 
                     work_size = MAX(work_size, cur);
@@ -16147,7 +16143,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_ELU:
             case GGML_OP_RELU:
                 {
-                    node->n_tasks = 1;
+                    n_tasks = 1;
                 } break;
             case GGML_OP_MUL:
             case GGML_OP_GELU:
@@ -16158,40 +16154,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
                 } break;
             case GGML_OP_MUL_MAT:
             case GGML_OP_OUT_PROD:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     // TODO: use different scheduling for different matrix sizes
                     //const int nr0 = ggml_nrows(node->src0);
                     //const int nr1 = ggml_nrows(node->src1);
 
-                    //node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
-                    //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
+                    //n_tasks = MIN(n_threads, MAX(1, nr0/128));
+                    //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
 
                     size_t cur = 0;
                     const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
 
 #if defined(GGML_USE_CUBLAS)
                     if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
-                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                        n_tasks = 1; // TODO: this actually is doing nothing
                                      //       the threads are still spinning
-                    }
-                    else
+                    } else
 #elif defined(GGML_USE_CLBLAST)
                     if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
-                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                        n_tasks = 1; // TODO: this actually is doing nothing
                                      //       the threads are still spinning
                         cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
-                    }
-                    else
+                    } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                        node->n_tasks = 1; // TODO: this actually is doing nothing
+                        n_tasks = 1; // TODO: this actually is doing nothing
                                      //       the threads are still spinning
                         if (node->src0->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
@@ -16209,7 +16203,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_SCALE:
                 {
-                    node->n_tasks = 1;
+                    n_tasks = 1;
                 } break;
             case GGML_OP_SET:
             case GGML_OP_CONT:
@@ -16222,7 +16216,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_DIAG:
             case GGML_OP_DIAG_MASK_ZERO:
                 {
-                    node->n_tasks = 1;
+                    n_tasks = 1;
                 } break;
             case GGML_OP_DIAG_MASK_INF:
             case GGML_OP_SOFT_MAX:
@@ -16230,19 +16224,19 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_ROPE:
             case GGML_OP_ROPE_BACK:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
                 } break;
             case GGML_OP_ALIBI:
                 {
-                    node->n_tasks = 1; //TODO
+                    n_tasks = 1; //TODO
                 } break;
             case GGML_OP_CLAMP:
                 {
-                    node->n_tasks = 1; //TODO
+                    n_tasks = 1; //TODO
                 } break;
             case GGML_OP_CONV_1D:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     GGML_ASSERT(node->src0->ne[3] == 1);
                     GGML_ASSERT(node->src1->ne[2] == 1);
@@ -16271,7 +16265,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_CONV_2D:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     GGML_ASSERT(node->src1->ne[3] == 1);
 
@@ -16306,45 +16300,45 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_FLASH_ATTN:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
 
                     const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
 
                     if (node->src1->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
 
                     if (node->src1->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                     }
 
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_FLASH_FF:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
 
                     if (node->src1->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
                     }
 
                     if (node->src1->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2
                     }
 
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_FLASH_ATTN_BACK:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
                     size_t cur = 0;
 
@@ -16352,13 +16346,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
                     if (node->src1->type == GGML_TYPE_F32) {
-                        cur  = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
                     }
 
                     if (node->src1->type == GGML_TYPE_F16) {
-                        cur  = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
-                        cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
+                        cur  = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+                        cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
                     }
 
                     work_size = MAX(work_size, cur);
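Note: a quick arithmetic check of the sizing above, with illustrative numbers only (assuming GGML_SOFT_MAX_UNROLL is 4): for a FLASH_ATTN node whose src1->ne[1] is 30 and n_tasks is 8,

    const int64_t ne11 = ((30 + 4 - 1)/4)*4;      // ggml_up(30, 4) == 32
    const size_t  cur  = 2*sizeof(float)*ne11*8;  // two float scratch rows per task
    // cur == 2*4*32*8 == 2048 bytes; work_size keeps the max over all nodes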
@@ -16371,46 +16365,76 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_MAP_CUSTOM2:
             case GGML_OP_MAP_CUSTOM3:
                 {
-                    node->n_tasks = 1;
+                    n_tasks = 1;
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
-                    size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
+                    size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks);
 
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
                 {
-                    node->n_tasks = n_threads;
+                    n_tasks = n_threads;
 
-                    size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
+                    size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks;
 
                     work_size = MAX(work_size, cur);
                 } break;
             case GGML_OP_NONE:
                 {
-                    node->n_tasks = 1;
+                    n_tasks = 1;
                 } break;
             case GGML_OP_COUNT:
                 {
                     GGML_ASSERT(false);
                 } break;
         }
+
+        cplan.n_tasks[i] = n_tasks;
     }
 
-    if (cgraph->work != NULL && work_size > cgraph->work_size) {
-        GGML_ASSERT(false); // TODO: better handling
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
     }
 
-    if (work_size > 0 && cgraph->work == NULL) {
-        cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;
 
-        GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-        cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
+    return cplan;
+}
+
+void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
+    {
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
+
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
+        }
+
+        for (int i = 0; i < cgraph->n_nodes; ++i) {
+            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
+            }
         }
     }
+
+    const int n_threads = cplan->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_plan             =*/ cplan,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
     // create thread pool
     if (n_threads > 1) {
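Note: taken together, the new contract is plan first, allocate the work buffer yourself, then compute. A minimal caller sketch (hypothetical variable names):

    struct ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);

    std::vector<uint8_t> work;
    if (cplan.work_size > 0) {
        work.resize(cplan.work_size);  // the caller owns the buffer now
        cplan.work_data = work.data();
    }

    ggml_graph_compute(&gf, &cplan);   // asserts work_data is set when work_size > 0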
@@ -16473,6 +16497,17 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
         }
     }
 
+void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
+    GGML_ASSERT(buf);
+
+    cplan.work_data = buf->data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
 struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * leaf = cgraph->leafs[i];
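Note: this wrapper keeps the old one-call convenience but allocates the work buffer inside the ggml context, so the context must have headroom for it. A hypothetical call site:

    ggml_graph_compute_with_ctx(ctx0, &gf, /*n_threads =*/ 4);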
@@ -16511,14 +16546,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
     const int64_t * ne = tensor->ne;
     const size_t  * nb = tensor->nb;
 
-    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
             arg,
             ggml_type_name(tensor->type),
             ggml_op_name  (tensor->op),
             tensor->n_dims,
             ne[0], ne[1], ne[2], ne[3],
             nb[0], nb[1], nb[2], nb[3],
-            tensor->n_tasks,
             tensor->data,
             tensor->name);
 }
@@ -17254,9 +17288,6 @@ static enum ggml_opt_result ggml_opt_adam(
         struct ggml_cgraph * gb) {
     GGML_ASSERT(ggml_is_scalar(f));
 
-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     // these will store the parameters we want to optimize
     struct ggml_tensor * ps[GGML_MAX_PARAMS];
 
@@ -17303,7 +17334,8 @@ static enum ggml_opt_result ggml_opt_adam(
     // compute the function value
     ggml_graph_reset  (gf);
     ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute(ctx, gb);
+
+    ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
 
     opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
     opt->adam.fx_best = opt->adam.fx_prev;
@@ -17383,7 +17415,8 @@ static enum ggml_opt_result ggml_opt_adam(
 
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
+
+        ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
 
         const float fx = ggml_get_f32_1d(f, 0);
 
@@ -17505,7 +17538,8 @@ static enum ggml_opt_result linesearch_backtracking(
 
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
+
+        ggml_graph_compute_with_ctx(ctx, gb, params->n_threads);
 
         ggml_opt_get_grad(np, ps, g);
 
@@ -17573,9 +17607,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         }
     }
 
-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
     const int m = params.lbfgs.m;
 
     // these will store the parameters we want to optimize
@@ -17627,7 +17658,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
 
         ggml_graph_reset  (gf);
         ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
+
+        ggml_graph_compute_with_ctx(ctx, gb, params.n_threads);
 
         ggml_opt_get_grad(np, ps, g);
 
44 ggml.h
@@ -65,7 +65,7 @@
 // ggml_set_f32(a, 3.0f);
 // ggml_set_f32(b, 4.0f);
 //
-// ggml_graph_compute(ctx0, &gf);
+// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
 //
 // printf("f = %f\n", ggml_get_f32_1d(f, 0));
 //
@@ -418,9 +418,6 @@ extern "C" {
         struct ggml_tensor * src1;
         struct ggml_tensor * opt[GGML_MAX_OPT];
 
-        // thread scheduling
-        int n_tasks;
-
         // performance
         int perf_runs;
         int64_t perf_cycles;
@@ -432,19 +429,27 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[4];
+        char padding[8];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
+        int n_tasks[GGML_MAX_NODES];
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
@@ -1297,9 +1302,16 @@ extern "C" {
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API void              ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
     GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
@@ -1514,9 +1526,15 @@ extern "C" {
     // Internal types and functions exposed for tests and benchmarks
     //
 
-    typedef void (*ggml_to_float_t)(const void * x, float * y, int k);
-    typedef void (*ggml_from_float_t)(const float * x, void * y, int k);
-    typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y);
+#ifdef  __cplusplus
+    // restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
         ggml_to_float_t to_float;
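Note: `restrict` is a C99 keyword that standard C++ lacks, so the macro lets one header serve both compilers while C builds keep the no-aliasing guarantee that helps auto-vectorization. An illustration (not from the header):

    // GGML_RESTRICT expands to `restrict` in C, to nothing in C++
    void scale_rows(float * GGML_RESTRICT dst, const float * GGML_RESTRICT src, int n, float s) {
        for (int i = 0; i < n; ++i) {
            dst[i] = s*src[i];  // a C compiler may vectorize freely: dst and src do not alias
        }
    }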
50 llama.cpp
@@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
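Note: because the helper resizes a caller-owned vector, repeated evaluations reuse one allocation instead of reallocating the work buffer per graph. Hypothetical usage:

    std::vector<uint8_t> work_buffer;
    for (int step = 0; step < n_steps; ++step) {
        ggml_graph_compute_helper(work_buffer, &gf, n_threads);  // buffer grows once, then is reused
    }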
@@ -321,6 +340,9 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -758,7 +780,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -1265,7 +1286,7 @@ static bool llama_eval_internal(
         const float * embd,
         const int     n_tokens,
         const int     n_past,
-        const int     n_threads,
+              int     n_threads,
         const char  * cgraph_fname) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
@@ -1306,10 +1327,11 @@ static bool llama_eval_internal(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
+    ggml_cgraph gf = {};
+
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
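Note: restating the heuristic above: for a batch of 32 or more tokens handled by a CPU BLAS backend, the BLAS library does the heavy mat-muls internally, and extra ggml threads would only spin-wait and degrade throughput, so the eval drops to a single thread:

    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
        n_threads = 1;  // let BLAS use the cores instead of spinning workers
    }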
@@ -1593,6 +1615,7 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
@@ -1612,10 +1635,10 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
 
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 #else
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
 #endif
 
     if (cgraph_fname) {
@@ -2645,7 +2668,7 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -2802,6 +2825,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
 
+    std::vector<uint8_t> work_buffer;
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2966,8 +2992,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+
+        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -3120,7 +3146,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -3140,7 +3165,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
@@ -3226,7 +3251,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -3246,7 +3270,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
         ggml_free(cpy_ctx);
     }
tests/CMakeLists.txt
@@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-# llama_add_test(test-grad0.c) # SLOW
+llama_add_test(test-grad0.c) # SLOW
 # llama_add_test(test-opt.c) # SLOW
tests/test-grad0.c
@@ -10,6 +10,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
 #define MAX_NARGS 3
 
 #undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 
 int irand(int n) {
     if (n == 0) return 0;
-    else return rand()%n;
+    return rand()%n;
 }
 
 void get_random_dims(int64_t * dims, int ndims) {
@@ -159,13 +161,15 @@ struct ggml_tensor * get_random_tensor_int(
 float get_element(const struct ggml_tensor * t, int idx) {
     if (t->type == GGML_TYPE_F32) {
         return ((float *)t->data)[idx];
-    } else if (t->type == GGML_TYPE_I32) {
+    }
+
+    if (t->type == GGML_TYPE_I32) {
         return ((int32_t *)t->data)[idx];
-    } else {
-        assert(false);
-        return INFINITY;
     }
-}
+
+    assert(false);
+    return INFINITY;
+}
 
 void set_element(struct ggml_tensor * t, int idx, float value) {
     ((float *)t->data)[idx] = value;
@@ -215,15 +219,14 @@ bool check_gradient(
     }
 
     struct ggml_cgraph gf = ggml_build_forward (f);
-    gf.n_threads = n_threads;
 
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
-    gb.n_threads = n_threads;
 
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
     ggml_graph_reset  (&gf);
     ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute(ctx0, &gb);
+
+    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
     // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
     // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
@@ -236,15 +239,16 @@ bool check_gradient(
                 const float xm = x0 - eps;
                 const float xp = x0 + eps;
                 set_element(x[i], k, xp);
-                ggml_graph_compute(ctx0, &gf);
+
+                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
                 const float f0 = ggml_get_f32_1d(f, 0);
 
                 set_element(x[i], k, xm);
-                ggml_graph_compute(ctx0, &gf);
+
+                ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
                 const float f1 = ggml_get_f32_1d(f, 0);
 
                 const float g0 = (f0 - f1)/(2.0f*eps);
 
                 set_element(x[i], k, x0);
@@ -252,12 +256,13 @@ bool check_gradient(
                 // compute gradient using backward graph
                 ggml_graph_reset  (&gf);
                 ggml_set_f32      (f->grad, 1.0f);
-                ggml_graph_compute(ctx0, &gb);
+
+                ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
 
                 const float g1 = get_element(x[i]->grad, k);
 
                 const float error_abs = fabsf(g0 - g1);
-                const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+                const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 
                 if (error_abs > max_error_abs || error_rel > max_error_rel) {
                     printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
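Note: for reference, the numeric check test-grad0 performs around each parameter x0 is a central finite difference, g0 = (f(x0+eps) - f(x0-eps)) / (2*eps), compared against the backward-graph gradient g1. A self-contained sketch of the comparison:

    #include <cmath>
    static bool grad_matches(float f_plus, float f_minus, float eps, float g1,
                             float max_abs, float max_rel) {
        const float g0        = (f_plus - f_minus)/(2.0f*eps);  // central difference
        const float error_abs = fabsf(g0 - g1);
        const float error_rel = g0 != 0.0f ? error_abs/fabsf(g0) : 0.0f;
        return error_abs <= max_abs && error_rel <= max_rel;
    }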
tests/test-opt.c
@@ -7,6 +7,7 @@
 
 #define MAX_NARGS 2
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 
 //
 // logging
 //
@@ -33,7 +34,7 @@
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
 
-float frand() {
+float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
 
@@ -114,7 +115,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
     ((float *)t->data)[idx] = value;
 }
 
-int main(int argc, const char ** argv) {
+int main(void) {
     struct ggml_init_params params = {
         .mem_size   = 1024*1024*1024,
         .mem_buffer = NULL,
@@ -137,10 +138,11 @@ int main(void) {
     struct ggml_tensor * d = ggml_sub(ctx, c, ab);
     struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
 
-
     struct ggml_cgraph ge = ggml_build_forward(e);
     ggml_graph_reset(&ge);
-    ggml_graph_compute(ctx, &ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe = ggml_get_f32_1d(e, 0);
     printf("%s: e = %.4f\n", __func__, fe);
 
@@ -149,7 +151,9 @@ int main(void) {
     ggml_opt(ctx, opt_params, e);
 
     ggml_graph_reset(&ge);
-    ggml_graph_compute(ctx, &ge);
+
+    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
+
     const float fe_opt = ggml_get_f32_1d(e, 0);
     printf("%s: original  e = %.4f\n", __func__, fe);
     printf("%s: optimized e = %.4f\n", __func__, fe_opt);