Merge branch 'master' into gg/cmake-warnings
This commit is contained in:
commit
c254ee763a
20 changed files with 1012 additions and 385 deletions
15
.github/labeler.yml
vendored
15
.github/labeler.yml
vendored
|
@ -3,19 +3,18 @@ Kompute:
|
|||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-kompute.h
|
||||
- ggml/src/ggml-kompute.cpp
|
||||
- ggml/src/ggml-kompute/**
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-metal.h
|
||||
- ggml/src/ggml-metal.cpp
|
||||
- ggml/src/ggml-metal/**
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-sycl.h
|
||||
- ggml/src/ggml-sycl.cpp
|
||||
- ggml/src/ggml-sycl/**
|
||||
- docs/backend/SYCL.md
|
||||
- examples/sycl/**
|
||||
|
@ -27,8 +26,8 @@ Nvidia GPU:
|
|||
Vulkan:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/ggml_vk_generate_shaders.py
|
||||
- ggml/src/ggml-vulkan*
|
||||
- ggml/include/ggml-vulkan.h
|
||||
- ggml/src/ggml-vulkan/**
|
||||
documentation:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
@ -75,11 +74,7 @@ server:
|
|||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml*.h
|
||||
- ggml/src/ggml*.c
|
||||
- ggml/src/ggml*.cpp
|
||||
- ggml/src/ggml*.h
|
||||
- ggml-cuda/**
|
||||
- ggml/**
|
||||
nix:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
|
59
.github/workflows/build.yml
vendored
59
.github/workflows/build.yml
vendored
|
@ -871,8 +871,65 @@ jobs:
|
|||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
ubuntu-latest-cmake-cuda:
|
||||
runs-on: ubuntu-latest
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
apt update
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git
|
||||
|
||||
- name: Build with CMake
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
|
||||
cmake --build build
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.6.2']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.19
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
|
||||
cmake --build build --config Release -t ggml-cuda
|
||||
cmake --build build --config Release
|
||||
|
||||
windows-2019-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -1173,7 +1230,7 @@ jobs:
|
|||
- macOS-latest-make
|
||||
- macOS-latest-cmake
|
||||
- windows-latest-cmake
|
||||
- windows-latest-cmake-cuda
|
||||
- windows-2019-cmake-cuda
|
||||
- windows-latest-cmake-hip-release
|
||||
- macOS-latest-cmake-arm64
|
||||
- macOS-latest-cmake-x64
|
||||
|
|
13
.github/workflows/docker.yml
vendored
13
.github/workflows/docker.yml
vendored
|
@ -10,12 +10,10 @@
|
|||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
#pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||
workflow_dispatch: # allows manual triggering, useful for debugging
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because it is expensive
|
||||
- cron: '12 4 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
@ -29,7 +27,6 @@ permissions:
|
|||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
#if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
|
@ -117,7 +114,7 @@ jobs:
|
|||
swap-storage: true
|
||||
|
||||
- name: Build and push Docker image (tagged + versioned)
|
||||
if: github.event_name == 'push'
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
|
|
2
.github/workflows/nix-ci.yml
vendored
2
.github/workflows/nix-ci.yml
vendored
|
@ -5,8 +5,10 @@ on:
|
|||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
|
9
.github/workflows/python-lint.yml
vendored
9
.github/workflows/python-lint.yml
vendored
|
@ -1,6 +1,13 @@
|
|||
name: flake8 Lint
|
||||
|
||||
on: [push, pull_request]
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/python-lint.yml', '**/*.py']
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
|
|
|
@ -164,8 +164,11 @@ if (GGML_TARGET_DEFINES)
|
|||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
# all public headers
|
||||
set(LLAMA_PUBLIC_HEADERS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
configure_package_config_file(
|
||||
|
|
6
Makefile
6
Makefile
|
@ -34,6 +34,7 @@ BUILD_TARGETS = \
|
|||
llama-server \
|
||||
llama-simple \
|
||||
llama-simple-chat \
|
||||
llama-run \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
|
@ -1167,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: examples/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple: examples/simple/simple.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
|
|
@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
|
|||
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
|
||||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
|
||||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
|
||||
) {
|
||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
|
||||
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
|
||||
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -40,6 +40,7 @@ else()
|
|||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(run)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
|
|
5
examples/run/CMakeLists.txt
Normal file
5
examples/run/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
set(TARGET llama-run)
|
||||
add_executable(${TARGET} run.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
7
examples/run/README.md
Normal file
7
examples/run/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# llama.cpp/example/run
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
|
||||
|
||||
```bash
|
||||
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
|
||||
...
|
409
examples/run/run.cpp
Normal file
409
examples/run/run.cpp
Normal file
|
@ -0,0 +1,409 @@
|
|||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <climits>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
typedef std::unique_ptr<char[]> char_array_ptr;
|
||||
|
||||
struct Argument {
|
||||
std::string flag;
|
||||
std::string help_text;
|
||||
};
|
||||
|
||||
struct Options {
|
||||
std::string model_path, prompt_non_interactive;
|
||||
int ngl = 99;
|
||||
int n_ctx = 2048;
|
||||
};
|
||||
|
||||
class ArgumentParser {
|
||||
public:
|
||||
ArgumentParser(const char * program_name) : program_name(program_name) {}
|
||||
|
||||
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
|
||||
string_args[flag] = &var;
|
||||
arguments.push_back({flag, help_text});
|
||||
}
|
||||
|
||||
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
|
||||
int_args[flag] = &var;
|
||||
arguments.push_back({flag, help_text});
|
||||
}
|
||||
|
||||
int parse(int argc, const char ** argv) {
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (string_args.count(arg)) {
|
||||
if (i + 1 < argc) {
|
||||
*string_args[arg] = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else if (int_args.count(arg)) {
|
||||
if (i + 1 < argc) {
|
||||
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
|
||||
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (string_args["-m"]->empty()) {
|
||||
fprintf(stderr, "error: -m is required\n");
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
const char * program_name;
|
||||
std::unordered_map<std::string, std::string *> string_args;
|
||||
std::unordered_map<std::string, int *> int_args;
|
||||
std::vector<Argument> arguments;
|
||||
|
||||
int parse_int_arg(const char * arg, int & value) {
|
||||
char * end;
|
||||
const long val = std::strtol(arg, &end, 10);
|
||||
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
|
||||
value = static_cast<int>(val);
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void print_usage() const {
|
||||
printf("\nUsage:\n");
|
||||
printf(" %s [OPTIONS]\n\n", program_name);
|
||||
printf("Options:\n");
|
||||
for (const auto & arg : arguments) {
|
||||
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
class LlamaData {
|
||||
public:
|
||||
llama_model_ptr model;
|
||||
llama_sampler_ptr sampler;
|
||||
llama_context_ptr context;
|
||||
std::vector<llama_chat_message> messages;
|
||||
|
||||
int init(const Options & opt) {
|
||||
model = initialize_model(opt.model_path, opt.ngl);
|
||||
if (!model) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
context = initialize_context(model, opt.n_ctx);
|
||||
if (!context) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
sampler = initialize_sampler();
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
// Initializes the model and returns a unique pointer to it
|
||||
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
|
||||
if (!model) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
// Initializes the context with the specified parameters
|
||||
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_ctx;
|
||||
ctx_params.n_batch = n_ctx;
|
||||
|
||||
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
|
||||
if (!context) {
|
||||
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
|
||||
// Initializes and configures the sampler
|
||||
llama_sampler_ptr initialize_sampler() {
|
||||
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||
|
||||
return sampler;
|
||||
}
|
||||
};
|
||||
|
||||
// Add a message to `messages` and store its content in `owned_content`
|
||||
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
|
||||
std::vector<char_array_ptr> & owned_content) {
|
||||
char_array_ptr content(new char[text.size() + 1]);
|
||||
std::strcpy(content.get(), text.c_str());
|
||||
llama_data.messages.push_back({role, content.get()});
|
||||
owned_content.push_back(std::move(content));
|
||||
}
|
||||
|
||||
// Function to apply the chat template and resize `formatted` if needed
|
||||
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
|
||||
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||
if (result > static_cast<int>(formatted.size())) {
|
||||
formatted.resize(result);
|
||||
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
|
||||
std::vector<llama_token> & prompt_tokens) {
|
||||
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
prompt_tokens.resize(n_prompt_tokens);
|
||||
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
|
||||
true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
return n_prompt_tokens;
|
||||
}
|
||||
|
||||
// Check if we have enough space in the context to evaluate this batch
|
||||
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
|
||||
const int n_ctx = llama_n_ctx(ctx.get());
|
||||
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// convert the token to a string
|
||||
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
|
||||
char buf[256];
|
||||
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
GGML_ABORT("failed to convert token to piece\n");
|
||||
}
|
||||
|
||||
piece = std::string(buf, n);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
|
||||
printf("%s", piece.c_str());
|
||||
fflush(stdout);
|
||||
response += piece;
|
||||
}
|
||||
|
||||
// helper function to evaluate a prompt and generate a response
|
||||
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
|
||||
if (n_prompt_tokens < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
check_context_size(llama_data.context, batch);
|
||||
if (llama_decode(llama_data.context.get(), batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
// sample the next token, check is it an end of generation?
|
||||
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
|
||||
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::string piece;
|
||||
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_word_and_concatenate_to_response(piece, response);
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
|
||||
ArgumentParser parser(argv[0]);
|
||||
parser.add_argument("-m", opt.model_path, "model");
|
||||
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
|
||||
parser.add_argument("-c", opt.n_ctx, "context_size");
|
||||
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
|
||||
if (parser.parse(argc, argv)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_user_input(std::string & user) {
|
||||
std::getline(std::cin, user);
|
||||
return user.empty(); // Indicate an error or empty input
|
||||
}
|
||||
|
||||
// Function to generate a response based on the prompt
|
||||
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||
// Set response color
|
||||
printf("\033[33m");
|
||||
if (generate(llama_data, prompt, response)) {
|
||||
fprintf(stderr, "failed to generate response\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// End response with color reset and newline
|
||||
printf("\n\033[0m");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Helper function to apply the chat template and handle errors
|
||||
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
|
||||
const bool is_user_input, int & output_length) {
|
||||
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
|
||||
if (new_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
output_length = new_len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Helper function to handle user input
|
||||
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
|
||||
if (!prompt_non_interactive.empty()) {
|
||||
user_input = prompt_non_interactive;
|
||||
return true; // No need for interactive input
|
||||
}
|
||||
|
||||
printf("\033[32m> \033[0m");
|
||||
return !read_user_input(user_input); // Returns false if input ends the loop
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
|
||||
std::vector<char_array_ptr> owned_content;
|
||||
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
|
||||
int prev_len = 0;
|
||||
|
||||
while (true) {
|
||||
// Get user input
|
||||
std::string user_input;
|
||||
if (!handle_user_input(user_input, prompt_non_interactive)) {
|
||||
break;
|
||||
}
|
||||
|
||||
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
|
||||
owned_content);
|
||||
|
||||
int new_len;
|
||||
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
|
||||
std::string response;
|
||||
if (generate_response(llama_data, prompt, response)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void log_callback(const enum ggml_log_level level, const char * text, void *) {
|
||||
if (level == GGML_LOG_LEVEL_ERROR) {
|
||||
fprintf(stderr, "%s", text);
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_stdin_a_terminal() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
|
||||
DWORD mode;
|
||||
return GetConsoleMode(hStdin, &mode);
|
||||
#else
|
||||
return isatty(STDIN_FILENO);
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string read_pipe_data() {
|
||||
std::ostringstream result;
|
||||
result << std::cin.rdbuf(); // Read all data from std::cin
|
||||
return result.str();
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
Options opt;
|
||||
if (parse_arguments(argc, argv, opt)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!is_stdin_a_terminal()) {
|
||||
if (!opt.prompt_non_interactive.empty()) {
|
||||
opt.prompt_non_interactive += "\n\n";
|
||||
}
|
||||
|
||||
opt.prompt_non_interactive += read_pipe_data();
|
||||
}
|
||||
|
||||
llama_log_set(log_callback, nullptr);
|
||||
LlamaData llama_data;
|
||||
if (llama_data.init(opt)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (chat_loop(llama_data, opt.prompt_non_interactive)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -2267,49 +2267,48 @@ struct server_context {
|
|||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
llama_token id;
|
||||
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
|
||||
{
|
||||
completion_token_output result;
|
||||
slot.i_batch = -1;
|
||||
|
||||
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
common_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
||||
common_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1) {
|
||||
slot.t_start_generation = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
result.tok = id;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
continue;
|
||||
}
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1) {
|
||||
slot.t_start_generation = ggml_time_us();
|
||||
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
// check if the slot supports speculative decoding
|
||||
if (!slot.can_speculate()) {
|
||||
completion_token_output result;
|
||||
result.tok = id;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// do speculative decoding
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() || !slot.can_speculate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_token id = slot.sampled;
|
||||
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = slot.params.speculative.n_max;
|
||||
|
|
|
@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
|
|||
llama_token id_last = inp.back();
|
||||
|
||||
// all tokens currently in the target context
|
||||
auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
|
||||
llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
|
||||
prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
|
||||
|
||||
int n_past = inp.size() - 1;
|
||||
|
||||
|
@ -181,54 +182,44 @@ int main(int argc, char ** argv) {
|
|||
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||
|
||||
n_past += ids.size() - 1;
|
||||
n_drafted += batch_tgt.n_tokens - 1;
|
||||
n_drafted += draft.size(); // note: we ignore the discarded small drafts
|
||||
n_accept += ids.size() - 1;
|
||||
n_predict += ids.size();
|
||||
|
||||
// process the accepted tokens and update contexts
|
||||
//
|
||||
// this is the standard token post-processing that we normally do
|
||||
// in this case, we do it for a group of accepted tokens at once
|
||||
//
|
||||
{
|
||||
llama_token id = 0;
|
||||
std::string token_str;
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
prompt_tgt.push_back(id_last);
|
||||
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
id = ids[i];
|
||||
id_last = ids[i];
|
||||
|
||||
++n_predict;
|
||||
|
||||
if (llama_token_is_eog(model_tgt, id)) {
|
||||
has_eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
token_str = common_token_to_piece(ctx_tgt, id);
|
||||
|
||||
if (params.use_color && i + 1 < ids.size()) {
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||
} else {
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
if (llama_token_is_eog(model_tgt, id_last)) {
|
||||
has_eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
|
||||
const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
|
||||
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||
if (params.use_color && i + 1 < ids.size()) {
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||
} else {
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
prompt_tgt.push_back(id_last);
|
||||
prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
|
||||
|
||||
// remember the last accepted token for the next iteration
|
||||
id_last = id;
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
*/
|
||||
|
||||
#include "aclnn_ops.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <aclnnop/aclnn_avgpool2d.h>
|
||||
#include <aclnnop/aclnn_cast.h>
|
||||
|
@ -32,6 +33,8 @@
|
|||
#include <aclnnop/aclnn_group_norm.h>
|
||||
#include <aclnnop/aclnn_index_fill_tensor.h>
|
||||
#include <aclnnop/aclnn_layer_norm.h>
|
||||
#include <aclnnop/aclnn_mm.h>
|
||||
#include <aclnnop/aclnn_batch_matmul.h>
|
||||
#include <aclnnop/aclnn_matmul.h>
|
||||
#include <aclnnop/aclnn_max_pool.h>
|
||||
#include <aclnnop/aclnn_permute.h>
|
||||
|
@ -241,10 +244,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
int64_t concat_dim = 1;
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
int32_t acl_dim = 3 - dim;
|
||||
|
||||
aclTensor* tensors[] = {acl_src0, acl_src1};
|
||||
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
|
||||
aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
|
||||
aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
|
||||
|
||||
ACL_CHECK(aclDestroyTensorList(tensorList));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
|
@ -1437,10 +1444,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
ggml_tensor* src0 = dst->src[0]; // kernel
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
|
||||
|
@ -1462,9 +1465,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
const int64_t OH = is_2D ? ne2 : 1;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
// memory allocated increased to 3x when is_2D == false
|
||||
const int64_t n_bytes_factor = is_2D ? 1 : 3;
|
||||
|
||||
|
@ -2425,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
|||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
|
||||
// fp32, atlas a2 will transpose it to HFLOAT32.
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
@ -2443,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
|||
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication of two 2D tensors.
|
||||
*
|
||||
* This function computes the matrix multiplication of the input tensor
|
||||
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||
* destination tensor `acl_dst`.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_input The input tensor for the matrix multiplication.
|
||||
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||
cube_math_type, &workspaceSize,
|
||||
&executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication of two 3D tensors.
|
||||
*
|
||||
* This function computes the matrix multiplication of the input tensor
|
||||
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||
* destination tensor `acl_dst`.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_input The input tensor for the matrix multiplication.
|
||||
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||
cube_math_type, &workspaceSize,
|
||||
&executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication with floating-point precision on
|
||||
* tensors using the CANN backend.
|
||||
|
@ -2464,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|||
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
|
||||
BCAST_MUL_MAT_SHAPE(input, weight, dst);
|
||||
|
||||
// transpose weight: [1,2,3,4] -> [1,2,4,3]
|
||||
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]};
|
||||
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
int64_t n_dims = bcast_dims;
|
||||
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
|
||||
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
|
||||
n_dims = 2;
|
||||
} else if (bcast_input_ne[2] == 1) {
|
||||
n_dims = 3;
|
||||
}
|
||||
}
|
||||
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
|
||||
aclTensor* acl_input_tensor =
|
||||
ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
|
||||
int64_t transpose_ne[] = {
|
||||
bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]
|
||||
};
|
||||
size_t transpose_nb[] = {
|
||||
bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]
|
||||
};
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||
|
||||
switch (n_dims) {
|
||||
case 2:
|
||||
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
case 3:
|
||||
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
default:
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
|
@ -2503,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|||
ggml_tensor* src0 = dst->src[0]; // weight
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
||||
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
|
||||
// is regarded as batch. weight need transpose.
|
||||
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
|
||||
// The shape of the weight is NCHW.
|
||||
// Matrix multiplication uses HW dims.
|
||||
// HC is regarded as batch.
|
||||
// weight need transpose.
|
||||
float weight_elem_size;
|
||||
if (type == GGML_TYPE_Q4_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t)) / 2;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q8_0) {
|
||||
} else if (type == GGML_TYPE_Q8_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
|
||||
}
|
||||
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
|
||||
|
||||
// size of one matrix is element_size * height * width.
|
||||
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
|
||||
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
|
||||
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
|
||||
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
|
||||
|
||||
// scale stored at the end of weight. Also need transpose.
|
||||
GGML_ASSERT(QK4_0 == QK8_0);
|
||||
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
|
||||
size_t scale_elem_size = sizeof(uint16_t);
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
|
||||
scale_elem_size};
|
||||
size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
|
||||
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
||||
char* scale_offset = (char*)src0->data + weight_size;
|
||||
|
||||
// input
|
||||
void* input_buffer;
|
||||
size_t input_elem_size = sizeof(uint16_t);
|
||||
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
|
||||
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
|
||||
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
|
||||
|
||||
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
|
||||
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
|
||||
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
||||
void* input_buffer = src1->data;
|
||||
|
||||
// case in
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
|
||||
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
input_buffer = input_alloctor.get();
|
||||
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
|
||||
int64_t* input_cast_ne = src1->ne;
|
||||
size_t input_cast_nb[GGML_MAX_DIMS];
|
||||
|
@ -2552,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|||
}
|
||||
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
|
||||
input_cast_nb, GGML_MAX_DIMS);
|
||||
input_buffer,
|
||||
ACL_FLOAT16,
|
||||
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
|
||||
} else {
|
||||
input_buffer = src1->data;
|
||||
}
|
||||
|
||||
// output
|
||||
size_t output_elem_size = sizeof(uint16_t);
|
||||
int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
|
||||
size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
|
||||
ggml_cann_pool_alloc output_alloctor(
|
||||
ctx.pool(), ggml_nelements(dst) * output_elem_size);
|
||||
void* output_buffer = output_alloctor.get();
|
||||
size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
|
||||
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool());
|
||||
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
||||
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
||||
|
||||
// aclnn
|
||||
int64_t max_elem_size = 65535;
|
||||
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
||||
aclOpExecutor* executor = nullptr;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
|
||||
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
|
||||
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
|
||||
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
|
||||
|
||||
int64_t batch1 = n1 * src1->ne[2] + c1;
|
||||
int64_t batch0 = n0 * src0->ne[2] + c0;
|
||||
int64_t batch1 = (n1 * src1->ne[2]) + c1;
|
||||
int64_t batch0 = (n0 * src0->ne[2]) + c0;
|
||||
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
|
||||
input_elem_size, input_ne, input_nb, 2);
|
||||
|
||||
// first split
|
||||
int64_t weight_ne_offset = 0;
|
||||
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
|
||||
int64_t scale_ne_offset = 0;
|
||||
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
|
||||
int64_t output_ne_offset = 0;
|
||||
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
|
||||
weight_nb, 2);
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2);
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2);
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0 && workspaceAddr == nullptr) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
|
||||
workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
|
||||
// other splits
|
||||
for (int64_t split = 1; split < split_size; split++) {
|
||||
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
|
||||
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
|
||||
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
|
||||
scale_ne[0] = weight_ne[0];
|
||||
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
|
||||
output_ne[0] = weight_ne[0];
|
||||
|
||||
acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
}
|
||||
}
|
||||
|
||||
// cast out
|
||||
int64_t* output_cast_ne = dst->ne;
|
||||
size_t output_cast_nb[GGML_MAX_DIMS];
|
||||
output_cast_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
||||
if (dst->type != GGML_TYPE_F16) {
|
||||
int64_t* output_cast_ne = dst->ne;
|
||||
size_t output_cast_nb[GGML_MAX_DIMS];
|
||||
output_cast_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
output_buffer,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||
}
|
||||
|
||||
aclTensor* acl_output_tensor =
|
||||
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
|
||||
output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||
}
|
||||
|
||||
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
@ -2859,15 +3000,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
|||
ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||
const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
|
||||
int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
|
||||
aclOpExecutor** executor);
|
||||
aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
|
||||
uint64_t workspaceSize,
|
||||
aclOpExecutor* executor,
|
||||
aclrtStream stream);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only test with LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
|
||||
// TODO: with freq_factors
|
||||
GGML_ASSERT(src2 == NULL);
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
// const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
|
@ -2885,13 +3038,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
|
||||
|
||||
GGML_ASSERT(n_dims <= ne0);
|
||||
// TODO: with freq_factors
|
||||
GGML_ASSERT(src2 == NULL);
|
||||
// TODO: attn_factor != 1
|
||||
GGML_ASSERT(attn_factor == 1);
|
||||
// TODO: n_dims <= ne0
|
||||
GGML_ASSERT(n_dims == ne0);
|
||||
GGML_ASSERT(n_dims % 2 == 0);
|
||||
|
||||
// TODO: ext_factor != 0
|
||||
GGML_ASSERT(ext_factor == 0);
|
||||
// TODO: freq_scale != 1
|
||||
GGML_ASSERT(freq_scale == 1);
|
||||
// TODO: type == GGML_TYPE_F16
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
|
@ -2924,177 +3083,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|||
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
|
||||
theta_scale, is_neox);
|
||||
|
||||
// roll input
|
||||
void* input_roll_buffer;
|
||||
aclTensor* acl_minus_one_tensor;
|
||||
void* minus_one_scale_buffer = nullptr;
|
||||
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||
ggml_cann_pool_alloc minus_one_scale_allocator(
|
||||
ctx.pool(), sizeof(float_t) * src0->ne[0]);
|
||||
if (!is_neox) {
|
||||
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
|
||||
input_roll_buffer = roll_allocator.get();
|
||||
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
|
||||
src0->ne[2], src0->ne[3]};
|
||||
size_t input_roll_nb[GGML_MAX_DIMS];
|
||||
input_roll_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||
GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||
GGML_MAX_DIMS);
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
|
||||
int64_t shifts[] = {1};
|
||||
int64_t dims[] = {3};
|
||||
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
// init [-1, 1, -1, 1, ...]
|
||||
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_ones(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
int64_t dim = 3;
|
||||
int64_t* index = new int64_t[src0->ne[0]];
|
||||
for (int i = 0; i < src0->ne[0]; i++) {
|
||||
index[i] = i / 2 * 2;
|
||||
}
|
||||
int64_t index_num = src0->ne[0];
|
||||
float value = -1;
|
||||
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
|
||||
index_num, value);
|
||||
} else {
|
||||
// roll input: [q0,q1,q2,...] ->
|
||||
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
|
||||
input_roll_buffer = roll_allocator.get();
|
||||
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
|
||||
|
||||
int64_t shifts[] = {src0->ne[0] / 2};
|
||||
int64_t dims[] = {3};
|
||||
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
|
||||
// init [-1, -1, -1, 1, 1,1,...]
|
||||
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_ones(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
// -1 * first half
|
||||
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
|
||||
size_t first_half_nb[GGML_MAX_DIMS];
|
||||
first_half_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
|
||||
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
|
||||
first_half_nb, GGML_MAX_DIMS);
|
||||
bool inplace = true;
|
||||
float scale = -1;
|
||||
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
|
||||
ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
|
||||
int acl_mode = mode;
|
||||
if (mode == 0) {
|
||||
acl_mode = 1;
|
||||
}
|
||||
|
||||
// TODO: n_dims < ne0
|
||||
GGML_ASSERT(n_dims == src0->ne[0]);
|
||||
|
||||
// input * scale
|
||||
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
|
||||
ggml_nbytes(src0));
|
||||
void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
|
||||
size_t input_nb[GGML_MAX_DIMS];
|
||||
input_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
|
||||
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||
|
||||
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
|
||||
acl_input_roll_mul_scale_tensor);
|
||||
|
||||
// output
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_x = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
void* output_fp32_buffer;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
|
||||
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
|
||||
acl_sin_reshape_tensor);
|
||||
aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
|
||||
// TODO: ne0 != n_dims in mode2
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
size_t input_fp32_nb[GGML_MAX_DIMS];
|
||||
input_fp32_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc fp32_allocator1(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
void* input_fp32_buffer1 = fp32_allocator1.get();
|
||||
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
ggml_cann_pool_alloc fp32_allocator2(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
void* input_fp32_buffer2 = fp32_allocator2.get();
|
||||
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
|
||||
ggml_cann_pool_alloc fp32_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
output_fp32_buffer = fp32_allocator.get();
|
||||
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
|
||||
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
|
||||
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
|
||||
input_fp32_tensor2);
|
||||
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
|
||||
output_fp32_tensor);
|
||||
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
|
||||
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
|
||||
ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
|
||||
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
|
||||
executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_x));
|
||||
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
|
|
@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
|
|||
struct ggml_backend_cann_context {
|
||||
int32_t device; /**< Device ID. */
|
||||
std::string name; /**< Name of the device. */
|
||||
std::string description; /**< Description of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
|
||||
{nullptr}}; /**< Array of streams for the device. */
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
/**
|
||||
* @brief Constructor for initializing the context with a given device.
|
||||
* @param device Device ID.
|
||||
*/
|
||||
explicit ggml_backend_cann_context(int device)
|
||||
: device(device), name("CANN" + std::to_string(device)) {}
|
||||
: device(device), name("CANN" + std::to_string(device)) {
|
||||
ggml_cann_set_device(device);
|
||||
description = aclrtGetSocName();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor for cleaning up resources.
|
||||
|
|
|
@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
|
|||
ACL_CHECK(aclrtMemGetAllocationGranularity(
|
||||
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
||||
&info.devices[id].vmm_granularity));
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_cann_get_device_memory(id, &free, &total);
|
||||
info.devices[id].total_vram = free;
|
||||
}
|
||||
|
||||
// TODO: add more device info later.
|
||||
|
@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
|||
* @return A pointer to the allocated buffer.
|
||||
*/
|
||||
void* alloc(size_t size, size_t* actual_size) override {
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
int nnz = 0;
|
||||
size_t max_size = 0;
|
||||
|
@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
|||
return ptr;
|
||||
}
|
||||
void* ptr;
|
||||
size_t look_ahead_size = (size_t)(1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(
|
||||
aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = size;
|
||||
pool_size += size;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
||||
|
@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
/**
|
||||
* @brief The maximum size of the virtual memory pool (32 GB).
|
||||
*/
|
||||
static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
size_t max_size;
|
||||
|
||||
/**
|
||||
* @brief The device ID associated with this buffer pool.
|
||||
|
@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
*/
|
||||
explicit ggml_cann_pool_vmm(int device)
|
||||
: device(device),
|
||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {}
|
||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {
|
||||
auto dev = ggml_cann_info().devices[device];
|
||||
granularity = dev.vmm_granularity;
|
||||
max_size = dev.total_vram;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor to free all buffers in the virtual memory pool.
|
||||
|
@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
// round up the allocation size to the alignment to ensure that all
|
||||
// allocations are aligned for all data types
|
||||
const size_t alignment = 128;
|
||||
size = alignment * ((size + alignment - 1) / alignment);
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
|
||||
size_t avail = pool_size - pool_used;
|
||||
|
||||
if (size > avail) {
|
||||
// round up to the next multiple of the granularity
|
||||
size_t reserve_size = size - avail;
|
||||
reserve_size =
|
||||
granularity * ((reserve_size + granularity - 1) / granularity);
|
||||
reserve_size = GGML_PAD(reserve_size, granularity);
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
|
||||
GGML_ASSERT(pool_size + reserve_size <= max_size);
|
||||
|
||||
// allocate more physical memory
|
||||
aclrtPhysicalMemProp prop = {};
|
||||
|
@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
// reserve virtual address space (if not already reserved)
|
||||
if (pool_addr == 0) {
|
||||
ACL_CHECK(aclrtReserveMemAddress(
|
||||
&pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
|
||||
&pool_addr, max_size, 0, NULL, 1));
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
|
@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
||||
// reserved %llu MB)\n",
|
||||
// device, (unsigned long long) (pool_size/1024/1024),
|
||||
// (unsigned long long) (reserve_size/1024/1024));
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||
device, (unsigned long long) (pool_size/1024/1024),
|
||||
(unsigned long long) (reserve_size/1024/1024));
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
|
@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
*/
|
||||
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
||||
int device) {
|
||||
// return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
||||
}
|
||||
|
||||
|
@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|||
static bool ggml_backend_cann_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_cann_buffer_type_initialized) {
|
||||
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
||||
for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
|
||||
ggml_backend_cann_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
|
||||
/* .context = */
|
||||
new ggml_backend_cann_buffer_type_context{
|
||||
i, "CANN" + std::to_string(i)},
|
||||
|
@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
|
||||
void * hostPtr = nullptr;
|
||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
|
@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
|
||||
if (op->src[0]->ne[0] <= QK8_0) {
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q8_0:
|
||||
// TODO: fix me
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
|
@ -1706,9 +1726,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float * freq_scale = (float*)((int32_t*)op->op_params + 6);
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
float * attn_factor = (float*)((int32_t*)op->op_params + 8);
|
||||
// TODO: with freq_factors
|
||||
if (op->src[2] != NULL) {
|
||||
return false;
|
||||
}
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (*ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
// TODO: freq_scale != 1
|
||||
if (*freq_scale != 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: attn_factor != 1
|
||||
if (*attn_factor != 1) {
|
||||
return false;
|
||||
}
|
||||
//TODO: type == GGML_TYPE_F16
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_UPSCALE: {
|
||||
// aclnnUpsampleNearest2dGetWorkspaceSize not support
|
||||
// selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
|
||||
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
|
@ -1722,17 +1794,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
|
|
|
@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
|
|||
endif ()
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
|
||||
elseif (APPLE)
|
||||
if (GGML_NATIVE)
|
||||
set(USER_PROVIDED_MARCH FALSE)
|
||||
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
|
||||
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
|
||||
set(USER_PROVIDED_MARCH TRUE)
|
||||
break()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if (NOT USER_PROVIDED_MARCH)
|
||||
set(MARCH_FLAGS "-march=armv8.2a")
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
|
||||
add_compile_definitions(__ARM_FEATURE_DOTPROD)
|
||||
endif ()
|
||||
|
||||
set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
|
||||
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
|
||||
endif ()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
|
||||
list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
|
||||
endif ()
|
||||
endif ()
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
|
|
|
@ -474,9 +474,15 @@ void write_output_files() {
|
|||
|
||||
int main(int argc, char** argv) {
|
||||
std::map<std::string, std::string> args;
|
||||
for (int i = 1; i < argc; i += 2) {
|
||||
if (i + 1 < argc) {
|
||||
args[argv[i]] = argv[i + 1];
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (arg.rfind("--", 0) == 0) {
|
||||
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
||||
args[arg] = argv[i + 1];
|
||||
++i;
|
||||
} else {
|
||||
args[arg] = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
25
include/llama-cpp.h
Normal file
25
include/llama-cpp.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
#pragma once
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This header is for C++ only"
|
||||
#endif
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
struct llama_model_deleter {
|
||||
void operator()(llama_model * model) { llama_free_model(model); }
|
||||
};
|
||||
|
||||
struct llama_context_deleter {
|
||||
void operator()(llama_context * context) { llama_free(context); }
|
||||
};
|
||||
|
||||
struct llama_sampler_deleter {
|
||||
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
||||
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
|
||||
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
|
Loading…
Add table
Add a link
Reference in a new issue