Merge branch 'ggerganov:master' into k-shift2
This commit is contained in:
commit
e5ce8b412c
11 changed files with 440 additions and 253 deletions
6
Makefile
6
Makefile
|
@ -34,6 +34,7 @@ BUILD_TARGETS = \
|
|||
llama-save-load-state \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-simple-chat \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
|
@ -1287,6 +1288,11 @@ llama-simple: examples/simple/simple.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
|
|
@ -49,6 +49,7 @@ else()
|
|||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
endif()
|
||||
|
|
5
examples/simple-chat/CMakeLists.txt
Normal file
5
examples/simple-chat/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
set(TARGET llama-simple-chat)
|
||||
add_executable(${TARGET} simple-chat.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
7
examples/simple-chat/README.md
Normal file
7
examples/simple-chat/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# llama.cpp/example/simple-chat
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the chat template from the GGUF file.
|
||||
|
||||
```bash
|
||||
./llama-simple-chat -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048
|
||||
...
|
197
examples/simple-chat/simple-chat.cpp
Normal file
197
examples/simple-chat/simple-chat.cpp
Normal file
|
@ -0,0 +1,197 @@
|
|||
#include "llama.h"
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::string model_path;
|
||||
int ngl = 99;
|
||||
int n_ctx = 2048;
|
||||
|
||||
// parse command line arguments
|
||||
for (int i = 1; i < argc; i++) {
|
||||
try {
|
||||
if (strcmp(argv[i], "-m") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
model_path = argv[++i];
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-c") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
n_ctx = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||
if (i + 1 < argc) {
|
||||
ngl = std::stoi(argv[++i]);
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
} catch (std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (model_path.empty()) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// only print errors
|
||||
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
|
||||
if (level >= GGML_LOG_LEVEL_ERROR) {
|
||||
fprintf(stderr, "%s", text);
|
||||
}
|
||||
}, nullptr);
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the context
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_ctx;
|
||||
ctx_params.n_batch = n_ctx;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the sampler
|
||||
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||
|
||||
// helper function to evaluate a prompt and generate a response
|
||||
auto generate = [&](const std::string & prompt) {
|
||||
std::string response;
|
||||
|
||||
// tokenize the prompt
|
||||
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
|
||||
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// convert the token to a string, print it and add it to the response
|
||||
char buf[256];
|
||||
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
GGML_ABORT("failed to convert token to piece\n");
|
||||
}
|
||||
std::string piece(buf, n);
|
||||
printf("%s", piece.c_str());
|
||||
fflush(stdout);
|
||||
response += piece;
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
return response;
|
||||
};
|
||||
|
||||
std::vector<llama_chat_message> messages;
|
||||
std::vector<char> formatted(llama_n_ctx(ctx));
|
||||
int prev_len = 0;
|
||||
while (true) {
|
||||
// get user input
|
||||
printf("\033[32m> \033[0m");
|
||||
std::string user;
|
||||
std::getline(std::cin, user);
|
||||
|
||||
if (user.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
// add the user input to the message list and format it
|
||||
messages.push_back({"user", strdup(user.c_str())});
|
||||
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
if (new_len > (int)formatted.size()) {
|
||||
formatted.resize(new_len);
|
||||
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||
}
|
||||
if (new_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// remove previous messages to obtain the prompt to generate the response
|
||||
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
|
||||
|
||||
// generate a response
|
||||
printf("\033[33m");
|
||||
std::string response = generate(prompt);
|
||||
printf("\n\033[0m");
|
||||
|
||||
// add the response to the messages
|
||||
messages.push_back({"assistant", strdup(response.c_str())});
|
||||
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
|
||||
if (prev_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// free resources
|
||||
for (auto & msg : messages) {
|
||||
free(const_cast<char *>(msg.content));
|
||||
}
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
38
ggml/include/ggml-cpp.h
Normal file
38
ggml/include/ggml-cpp.h
Normal file
|
@ -0,0 +1,38 @@
|
|||
#pragma once
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This header is for C++ only"
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include <memory>
|
||||
|
||||
// Smart pointers for ggml types
|
||||
|
||||
// ggml
|
||||
|
||||
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
|
||||
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
|
||||
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
|
||||
|
||||
// ggml-alloc
|
||||
|
||||
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
|
||||
|
||||
// ggml-backend
|
||||
|
||||
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
|
||||
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
|
||||
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
|
||||
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
|
||||
|
||||
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
|
||||
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;
|
|
@ -558,10 +558,10 @@ extern "C" {
|
|||
|
||||
enum ggml_log_level {
|
||||
GGML_LOG_LEVEL_NONE = 0,
|
||||
GGML_LOG_LEVEL_INFO = 1,
|
||||
GGML_LOG_LEVEL_WARN = 2,
|
||||
GGML_LOG_LEVEL_ERROR = 3,
|
||||
GGML_LOG_LEVEL_DEBUG = 4,
|
||||
GGML_LOG_LEVEL_DEBUG = 1,
|
||||
GGML_LOG_LEVEL_INFO = 2,
|
||||
GGML_LOG_LEVEL_WARN = 3,
|
||||
GGML_LOG_LEVEL_ERROR = 4,
|
||||
GGML_LOG_LEVEL_CONT = 5, // continue previous log
|
||||
};
|
||||
|
||||
|
|
|
@ -1368,6 +1368,7 @@ add_library(ggml
|
|||
../include/ggml.h
|
||||
../include/ggml-alloc.h
|
||||
../include/ggml-backend.h
|
||||
../include/ggml-cpp.h
|
||||
ggml.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.cpp
|
||||
|
|
|
@ -1047,7 +1047,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
return buf;
|
||||
}
|
||||
|
||||
buf->size = size;
|
||||
vk::BufferCreateInfo buffer_create_info{
|
||||
vk::BufferCreateFlags(),
|
||||
size,
|
||||
|
@ -1075,7 +1074,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
|
||||
if (memory_type_index == UINT32_MAX) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
|
@ -1092,13 +1090,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
}
|
||||
catch (const vk::SystemError& e) {
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw e;
|
||||
}
|
||||
} else {
|
||||
// Out of Host/Device memory, clean up buffer
|
||||
device->device.destroyBuffer(buf->buffer);
|
||||
buf->size = 0;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -1111,6 +1107,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|||
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
|
||||
|
||||
buf->device = device;
|
||||
buf->size = size;
|
||||
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
device->memory_logger->log_allocation(buf, size);
|
||||
|
|
1
spm-headers/ggml-cpp.h
Symbolic link
1
spm-headers/ggml-cpp.h
Symbolic link
|
@ -0,0 +1 @@
|
|||
../ggml/include/ggml-cpp.h
|
424
src/llama.cpp
424
src/llama.cpp
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue