Merge branch 'master' into gg/cmake-warnings

commit c254ee763a
Georgi Gerganov, 2024-11-26 14:16:08 +02:00, committed by GitHub
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
20 changed files with 1012 additions and 385 deletions

.github/labeler.yml

@@ -3,19 +3,18 @@ Kompute:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-kompute.h
-      - ggml/src/ggml-kompute.cpp
+      - ggml/src/ggml-kompute/**
       - README-kompute.md
 Apple Metal:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-metal.h
-      - ggml/src/ggml-metal.cpp
+      - ggml/src/ggml-metal/**
       - README-metal.md
 SYCL:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-sycl.h
-      - ggml/src/ggml-sycl.cpp
       - ggml/src/ggml-sycl/**
       - docs/backend/SYCL.md
       - examples/sycl/**
@@ -27,8 +26,8 @@ Nvidia GPU:
 Vulkan:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml/ggml_vk_generate_shaders.py
-      - ggml/src/ggml-vulkan*
+      - ggml/include/ggml-vulkan.h
+      - ggml/src/ggml-vulkan/**
 documentation:
   - changed-files:
     - any-glob-to-any-file:
@@ -75,11 +74,7 @@ server:
 ggml:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml/include/ggml*.h
-      - ggml/src/ggml*.c
-      - ggml/src/ggml*.cpp
-      - ggml/src/ggml*.h
-      - ggml-cuda/**
+      - ggml/**
 nix:
   - changed-files:
     - any-glob-to-any-file:


@@ -871,8 +871,65 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
           name: llama-bin-win-${{ matrix.build }}.zip
 
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt update
+          apt install -y cmake build-essential ninja-build libgomp1 git
+
+      - name: Build with CMake
+        run: |
+          cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+          cmake --build build
+
   windows-latest-cmake-cuda:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.6.2']
+        build: ['cuda']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install CUDA toolkit
+        id: cuda-toolkit
+        uses: Jimver/cuda-toolkit@v0.2.19
+        with:
+          cuda: ${{ matrix.cuda }}
+          method: 'network'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
+          cmake --build build --config Release -t ggml-cuda
+          cmake --build build --config Release
+
+  windows-2019-cmake-cuda:
     runs-on: windows-2019
+    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:
@@ -1173,7 +1230,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-2019-cmake-cuda
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64


@@ -10,12 +10,10 @@
 name: Publish Docker image
 
 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:
@@ -117,7 +114,7 @@ jobs:
         swap-storage: true
 
       - name: Build and push Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: .


@@ -5,8 +5,10 @@ on:
   push:
     branches:
       - master
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
   pull_request:
     types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@@ -1,6 +1,13 @@
 name: flake8 Lint
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@@ -164,8 +164,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 
 configure_package_config_file(


@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -1167,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)


@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }


@@ -40,6 +40,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(save-load-state)
+    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)


@@ -0,0 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/run/README.md (new file)

@@ -0,0 +1,7 @@
# llama.cpp/example/run

The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
...
```

examples/run/run.cpp (new file)

@@ -0,0 +1,409 @@
#if defined(_WIN32)
#include <windows.h>
#else
#include <unistd.h>
#endif

#include <climits>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "llama-cpp.h"

typedef std::unique_ptr<char[]> char_array_ptr;

struct Argument {
    std::string flag;
    std::string help_text;
};

struct Options {
    std::string model_path, prompt_non_interactive;
    int ngl = 99;
    int n_ctx = 2048;
};

class ArgumentParser {
  public:
    ArgumentParser(const char * program_name) : program_name(program_name) {}

    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
        string_args[flag] = &var;
        arguments.push_back({flag, help_text});
    }

    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
        int_args[flag] = &var;
        arguments.push_back({flag, help_text});
    }

    int parse(int argc, const char ** argv) {
        for (int i = 1; i < argc; ++i) {
            std::string arg = argv[i];
            if (string_args.count(arg)) {
                if (i + 1 < argc) {
                    *string_args[arg] = argv[++i];
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();
                    return 1;
                }
            } else if (int_args.count(arg)) {
                if (i + 1 < argc) {
                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
                        print_usage();
                        return 1;
                    }
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();
                    return 1;
                }
            } else {
                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
                print_usage();
                return 1;
            }
        }

        if (string_args["-m"]->empty()) {
            fprintf(stderr, "error: -m is required\n");
            print_usage();
            return 1;
        }

        return 0;
    }

  private:
    const char * program_name;
    std::unordered_map<std::string, std::string *> string_args;
    std::unordered_map<std::string, int *> int_args;
    std::vector<Argument> arguments;

    int parse_int_arg(const char * arg, int & value) {
        char * end;
        const long val = std::strtol(arg, &end, 10);
        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
            value = static_cast<int>(val);
            return 0;
        }
        return 1;
    }

    void print_usage() const {
        printf("\nUsage:\n");
        printf("  %s [OPTIONS]\n\n", program_name);
        printf("Options:\n");
        for (const auto & arg : arguments) {
            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
        }
        printf("\n");
    }
};

class LlamaData {
  public:
    llama_model_ptr model;
    llama_sampler_ptr sampler;
    llama_context_ptr context;
    std::vector<llama_chat_message> messages;

    int init(const Options & opt) {
        model = initialize_model(opt.model_path, opt.ngl);
        if (!model) {
            return 1;
        }

        context = initialize_context(model, opt.n_ctx);
        if (!context) {
            return 1;
        }

        sampler = initialize_sampler();
        return 0;
    }

  private:
    // Initializes the model and returns a unique pointer to it
    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = ngl;

        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
        if (!model) {
            fprintf(stderr, "%s: error: unable to load model\n", __func__);
        }

        return model;
    }

    // Initializes the context with the specified parameters
    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
        llama_context_params ctx_params = llama_context_default_params();
        ctx_params.n_ctx = n_ctx;
        ctx_params.n_batch = n_ctx;

        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
        if (!context) {
            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        }

        return context;
    }

    // Initializes and configures the sampler
    llama_sampler_ptr initialize_sampler() {
        llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

        return sampler;
    }
};

// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
                        std::vector<char_array_ptr> & owned_content) {
    char_array_ptr content(new char[text.size() + 1]);
    std::strcpy(content.get(), text.c_str());

    llama_data.messages.push_back({role, content.get()});
    owned_content.push_back(std::move(content));
}

// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    if (result > static_cast<int>(formatted.size())) {
        formatted.resize(result);
        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    }

    return result;
}

// Function to tokenize the prompt
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
                           std::vector<llama_token> & prompt_tokens) {
    const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
    prompt_tokens.resize(n_prompt_tokens);
    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
                       true) < 0) {
        GGML_ABORT("failed to tokenize the prompt\n");
    }

    return n_prompt_tokens;
}

// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
    const int n_ctx = llama_n_ctx(ctx.get());
    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf("\033[0m\n");
        fprintf(stderr, "context size exceeded\n");
        return 1;
    }

    return 0;
}

// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
    char buf[256];
    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
    if (n < 0) {
        GGML_ABORT("failed to convert token to piece\n");
    }

    piece = std::string(buf, n);
    return 0;
}

static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
    printf("%s", piece.c_str());
    fflush(stdout);
    response += piece;
}

// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    std::vector<llama_token> prompt_tokens;
    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
    if (n_prompt_tokens < 0) {
        return 1;
    }

    // prepare a batch for the prompt
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    llama_token new_token_id;
    while (true) {
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            GGML_ABORT("failed to decode\n");
        }

        // sample the next token and check whether it is an end of generation
        new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
        if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
            break;
        }

        std::string piece;
        if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
            return 1;
        }

        print_word_and_concatenate_to_response(piece, response);

        // prepare the next batch with the sampled token
        batch = llama_batch_get_one(&new_token_id, 1);
    }

    return 0;
}

static int parse_arguments(const int argc, const char ** argv, Options & opt) {
    ArgumentParser parser(argv[0]);
    parser.add_argument("-m", opt.model_path, "model");
    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
    parser.add_argument("-c", opt.n_ctx, "context_size");
    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
    if (parser.parse(argc, argv)) {
        return 1;
    }

    return 0;
}

static int read_user_input(std::string & user) {
    std::getline(std::cin, user);
    return user.empty();  // Indicate an error or empty input
}

// Function to generate a response based on the prompt
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    // Set response color
    printf("\033[33m");
    if (generate(llama_data, prompt, response)) {
        fprintf(stderr, "failed to generate response\n");
        return 1;
    }

    // End response with color reset and newline
    printf("\n\033[0m");
    return 0;
}

// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
                                                   const bool is_user_input, int & output_length) {
    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        return -1;
    }

    output_length = new_len;
    return 0;
}

// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
    if (!prompt_non_interactive.empty()) {
        user_input = prompt_non_interactive;
        return true;  // No need for interactive input
    }

    printf("\033[32m> \033[0m");
    return !read_user_input(user_input);  // Returns false if input ends the loop
}

// Main chat loop: read input, apply the chat template, generate a response
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
    std::vector<char_array_ptr> owned_content;
    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
    int prev_len = 0;

    while (true) {
        // Get user input
        std::string user_input;
        if (!handle_user_input(user_input, prompt_non_interactive)) {
            break;
        }

        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
                    owned_content);

        int new_len;
        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
            return 1;
        }

        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
        std::string response;
        if (generate_response(llama_data, prompt, response)) {
            return 1;
        }
    }
    return 0;
}

static void log_callback(const enum ggml_log_level level, const char * text, void *) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "%s", text);
    }
}

static bool is_stdin_a_terminal() {
#if defined(_WIN32)
    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
    DWORD mode;
    return GetConsoleMode(hStdin, &mode);
#else
    return isatty(STDIN_FILENO);
#endif
}

static std::string read_pipe_data() {
    std::ostringstream result;
    result << std::cin.rdbuf();  // Read all data from std::cin
    return result.str();
}

int main(int argc, const char ** argv) {
    Options opt;
    if (parse_arguments(argc, argv, opt)) {
        return 1;
    }

    if (!is_stdin_a_terminal()) {
        if (!opt.prompt_non_interactive.empty()) {
            opt.prompt_non_interactive += "\n\n";
        }

        opt.prompt_non_interactive += read_pipe_data();
    }

    llama_log_set(log_callback, nullptr);
    LlamaData llama_data;
    if (llama_data.init(opt)) {
        return 1;
    }

    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
        return 1;
    }

    return 0;
}
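One detail in `run.cpp` worth calling out: the `owned_content` vector exists because `llama_chat_message` stores raw `const char *` pointers rather than copies, so the strings backing each message must outlive the message vector. A minimal standalone sketch of the same ownership pattern (the `chat_message` struct below is a stand-in for illustration, not the llama.cpp type):

```cpp
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

// stand-in for llama_chat_message: it holds raw pointers, not copies
struct chat_message { const char * role; const char * content; };

int main() {
    std::vector<std::unique_ptr<char[]>> owned_content;  // keeps the strings alive
    std::vector<chat_message> messages;                  // holds raw pointers only

    auto add = [&](const char * role, const std::string & text) {
        std::unique_ptr<char[]> buf(new char[text.size() + 1]);
        std::strcpy(buf.get(), text.c_str());
        messages.push_back({role, buf.get()});
        owned_content.push_back(std::move(buf));  // must outlive `messages`
    };

    add("user", "Hello");
    add("assistant", "Hi there");

    for (const auto & m : messages) {
        std::printf("%s: %s\n", m.role, m.content);
    }
    return 0;
}
```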


@@ -2267,12 +2267,7 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id;
-
-                {
-                    completion_token_output result;
-
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
                 slot.i_batch = -1;
@@ -2285,6 +2280,7 @@ struct server_context {
                     metrics.on_prompt_eval(slot);
                 }
 
+                completion_token_output result;
                 result.tok = id;
 
                 const auto * cur_p = common_sampler_get_candidates(slot.smpl);
@@ -2306,11 +2302,14 @@ struct server_context {
                 }
             }
 
-            // check if the slot supports speculative decoding
-            if (!slot.can_speculate()) {
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
                     continue;
                 }
 
+                llama_token id = slot.sampled;
+
                 struct common_speculative_params params_spec;
                 params_spec.n_draft = slot.params.speculative.n_max;
                 params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;


@@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
     llama_token id_last = inp.back();
 
     // all tokens currently in the target context
-    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
+    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
     int n_past = inp.size() - 1;
@@ -181,29 +182,26 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
 
         n_past    += ids.size() - 1;
-        n_drafted += batch_tgt.n_tokens - 1;
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
         n_accept  += ids.size() - 1;
+        n_predict += ids.size();
 
         // process the accepted tokens and update contexts
         //
         // this is the standard token post-processing that we normally do
        // in this case, we do it for a group of accepted tokens at once
         //
-        {
-            llama_token id = 0;
-            std::string token_str;
-
-            for (size_t i = 0; i < ids.size(); ++i) {
-                id = ids[i];
-
-                ++n_predict;
-
-                if (llama_token_is_eog(model_tgt, id)) {
-                    has_eos = true;
-                    break;
-                }
-
-                token_str = common_token_to_piece(ctx_tgt, id);
+        for (size_t i = 0; i < ids.size(); ++i) {
+            prompt_tgt.push_back(id_last);
+
+            id_last = ids[i];
+
+            if (llama_token_is_eog(model_tgt, id_last)) {
+                has_eos = true;
+                break;
+            }
+
+            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
 
             if (params.use_color && i + 1 < ids.size()) {
                 LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
@@ -212,11 +210,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
-            break;
-        }
-
-        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
 
         {
             LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@@ -224,11 +218,8 @@ int main(int argc, char ** argv) {
             llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
         }
 
-        prompt_tgt.push_back(id_last);
-        prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
-
-        // remember the last accepted token for the next iteration
-        id_last = id;
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            break;
+        }
     }
 }
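A note on the bookkeeping in this hunk: `ids` holds the draft tokens the target model accepted plus the one token the target sampled itself, which is why `n_accept` adds `ids.size() - 1` while `n_predict` adds `ids.size()`. A tiny standalone sketch with made-up numbers:

```cpp
#include <cstdio>

int main() {
    int n_drafted = 0, n_accept = 0, n_predict = 0;

    // hypothetical iteration: the draft model proposed 8 tokens, the target
    // accepted 5 of them and then sampled one token of its own, so ids.size() == 6
    const int draft_size = 8;
    const int ids_size   = 6;

    n_drafted += draft_size;   // as in the hunk: discarded small drafts are ignored
    n_accept  += ids_size - 1; // only the accepted *draft* tokens
    n_predict += ids_size;     // every emitted token counts as a prediction

    std::printf("drafted=%d accepted=%d predicted=%d acceptance=%.2f\n",
                n_drafted, n_accept, n_predict, (double) n_accept / n_drafted);
    return 0;
}
```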


@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -32,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -241,10 +244,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1444,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1465,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;
@@ -2425,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2443,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  *        tensors using the CANN backend.
@@ -2464,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
-
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
+
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2503,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
     float weight_elem_size;
     if (type == GGML_TYPE_Q4_0) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
         weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
         GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
     }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
-    void* input_buffer;
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2552,73 +2642,122 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
     }
 
     // output
     size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
     uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
     for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
         for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
             int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
             int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
             aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
-
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                 workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
             ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
             ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
             ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         }
     }
 
     // cast out
+    if (dst->type != GGML_TYPE_F16) {
         int64_t* output_cast_ne = dst->ne;
         size_t output_cast_nb[GGML_MAX_DIMS];
         output_cast_nb[0] = sizeof(uint16_t);
@@ -2626,14 +2765,16 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
             output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
         }
 
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
         aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
 
         ACL_CHECK(aclDestroyTensor(acl_output_tensor));
         ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
 }
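The `max_elem_size`/`split_size` logic above chunks weights whose second dimension exceeds 65535 rows, presumably because of a per-dimension limit in the aclnn weight-quant matmul; each chunk's tensor view advances by the element count of the previous chunks. A standalone sketch of just the chunking arithmetic (the 200000-row size is invented for illustration):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;
    const int64_t n_rows = 200000;  // hypothetical src0->ne[1]

    // same formula as the hunk above
    const int64_t split_size = (n_rows / max_elem_size) + 1;

    int64_t offset = 0;
    for (int64_t split = 0; split < split_size; split++) {
        // every chunk gets max_elem_size rows except the last, which gets the remainder
        const int64_t rows = (max_elem_size * (split + 1) > n_rows)
                                 ? n_rows - max_elem_size * split
                                 : max_elem_size;
        if (rows <= 0) break;  // exact multiples leave an empty trailing split
        std::printf("split %lld: %lld rows starting at row %lld\n",
                    (long long) split, (long long) rows, (long long) offset);
        offset += rows;
    }
    return 0;
}
```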
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -2859,15 +3000,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +3038,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -2924,177 +3083,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);
 
-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, -1, -1, 1, 1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
-    }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-
-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
-    }
-
-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+
+    void* workspaceAddr = nullptr;
+
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
+    }
+
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }


@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {}
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.

View file

@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }

     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
-    static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35;  // 32 GB
+    size_t max_size;

     /**
      * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     explicit ggml_cann_pool_vmm(int device)
         : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {}
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }

     /**
      * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // round up the allocation size to the alignment to ensure that all
         // allocations are aligned for all data types
         const size_t alignment = 128;
-        size = alignment * ((size + alignment - 1) / alignment);
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }

         size_t avail = pool_size - pool_used;

         if (size > avail) {
             // round up to the next multiple of the granularity
             size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);

-            GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_size);

             // allocate more physical memory
             aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // reserve virtual address space (if not already reserved)
         if (pool_addr == 0) {
             ACL_CHECK(aclrtReserveMemAddress(
-                &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
+                &pool_addr, max_size, 0, NULL, 1));
         }

         // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // add to the pool
         pool_size += reserve_size;

-        // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
-        //               reserved %llu MB)\n",
-        //               device, (unsigned long long) (pool_size/1024/1024),
-        //               (unsigned long long) (reserve_size/1024/1024));
+#ifdef DEBUG_CANN_MALLOC
+        GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                      device, (unsigned long long) (pool_size/1024/1024),
+                      (unsigned long long) (reserve_size/1024/1024));
+#endif
         }

         GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 }
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;

     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_cann_buffer_type_interface,
-                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context = */
                  new ggml_backend_cann_buffer_type_context{
                      i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }

+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
@@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
+                case GGML_TYPE_Q8_0:
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q8_0:
-                    // TODO: fix me
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
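
The reordering matters because the cases fall through: GGML_TYPE_Q8_0 now runs its size guard first and, when the guard passes, drops into the shared return true. A self-contained mirror of that control flow (the enum and helper here are hypothetical; upstream QK8_0, the Q8_0 block size, is 32):

// example (not part of the diff): guarded switch fallthrough
#include <cstdint>
#include <cstdio>

constexpr int64_t QK8_0 = 32; // Q8_0 block size, as in ggml upstream

enum class src0_type { f32, f16, q4_0, q8_0, other };

static bool mul_mat_supported(src0_type type, int64_t ne0) {
    switch (type) {
        case src0_type::q8_0:
            // group size must not exceed k-1 in
            // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
            if (ne0 <= QK8_0) {
                return false;
            }
            [[fallthrough]]; // large Q8_0 is handled like the types below
        case src0_type::f16:
        case src0_type::f32:
        case src0_type::q4_0:
            return true;
        default:
            return false;
    }
}

int main() {
    printf("%d %d\n", mul_mat_supported(src0_type::q8_0, 32),    // 0
                      mul_mat_supported(src0_type::q8_0, 4096)); // 1
    return 0;
}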
@@ -1706,9 +1726,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            // TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -1722,17 +1794,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
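
The new GGML_OP_ROPE gate reads its float hyperparameters straight out of op->op_params, a raw int32_t array into which ggml type-puns float fields at fixed slots (6 through 8 here). A standalone sketch of that packing; the slot layout comes from the hunk above, everything else is illustrative:

// example (not part of the diff): float fields packed into an int32_t array
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t op_params[11] = {0};

    // writer side: store the ROPE hyperparameters into their slots
    const float freq_scale = 1.0f, ext_factor = 0.0f, attn_factor = 1.0f;
    memcpy(op_params + 6, &freq_scale,  sizeof(float));
    memcpy(op_params + 7, &ext_factor,  sizeof(float));
    memcpy(op_params + 8, &attn_factor, sizeof(float));

    // reader side: the backend reinterprets the same slots as floats,
    // mirroring the pointer casts in the hunk above
    float * fs = (float *)((int32_t *)op_params + 6);
    float * ef = (float *)((int32_t *)op_params + 7);
    float * af = (float *)((int32_t *)op_params + 8);

    printf("freq_scale=%g ext_factor=%g attn_factor=%g\n", *fs, *ef, *af);
    return 0; // the CANN gate requires *ef == 0, *fs == 1, *af == 1
}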

View file

@@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
             endif ()
             set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+        elseif (APPLE)
+            if (GGML_NATIVE)
+                set(USER_PROVIDED_MARCH FALSE)
+                foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
+                    if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
+                        set(USER_PROVIDED_MARCH TRUE)
+                        break()
+                    endif()
+                endforeach()
+
+                if (NOT USER_PROVIDED_MARCH)
+                    set(MARCH_FLAGS "-march=armv8.2a")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+                    if (GGML_COMPILER_SUPPORT_DOTPROD)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+                        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                    endif ()
+
+                    set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+
+                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                    if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+                        add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                    endif ()
+
+                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+                    list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+                endif ()
+            endif ()
         else()
             check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
             if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

View file

@@ -474,9 +474,15 @@ void write_output_files() {
 int main(int argc, char** argv) {
     std::map<std::string, std::string> args;
-    for (int i = 1; i < argc; i += 2) {
-        if (i + 1 < argc) {
-            args[argv[i]] = argv[i + 1];
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg.rfind("--", 0) == 0) {
+            if (i + 1 < argc && argv[i + 1][0] != '-') {
+                args[arg] = argv[i + 1];
+                ++i;
+            } else {
+                args[arg] = "";
+            }
         }
     }
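
The old loop walked argv in fixed pairs, so a bare switch such as --foo would swallow the next option as its value. The new loop treats only "--" tokens as keys and consumes a following token only when it does not itself start with '-'. A standalone copy of the new logic, printing what it parsed:

// example (not part of the diff): the new flag-parsing loop in isolation
#include <cstdio>
#include <map>
#include <string>

int main(int argc, char** argv) {
    std::map<std::string, std::string> args;
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg.rfind("--", 0) == 0) {          // only "--key" tokens are keys
            if (i + 1 < argc && argv[i + 1][0] != '-') {
                args[arg] = argv[i + 1];        // "--key value" pair
                ++i;
            } else {
                args[arg] = "";                 // bare "--flag" switch
            }
        }
    }
    for (const auto & kv : args) {
        printf("%s = '%s'\n", kv.first.c_str(), kv.second.c_str());
    }
    return 0;
}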

25
include/llama-cpp.h Normal file
View file

@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_free_model(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
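
A hypothetical usage sketch for the new header: wrapping the raw handles in the *_ptr aliases ties their lifetime to scope, and locals are destroyed in reverse declaration order, so the context is freed before the model it points into. The model path and overall flow here are illustrative, not from the diff; the llama.h calls are the ones available as of this commit:

// example (not part of the diff): scope-bound cleanup via llama-cpp.h
#include "llama-cpp.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model_ptr model(llama_load_model_from_file("model.gguf", mparams));
    if (!model) {
        return 1; // load failed; nothing to clean up
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context_ptr ctx(llama_new_context_with_model(model.get(), cparams));

    // on return, ctx is freed first, then model -- no manual llama_free calls
    return ctx ? 0 : 1;
}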