llama : refactor sampling v2 (#9294)

- Add `struct llama_sampler` and `struct llama_sampler_i`
- Add `llama_sampler_` API
- Add `llama_sampler_chain_` API for chaining multiple samplers
- Remove `LLAMA_API_INTERNAL`
- Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
This commit is contained in:
Georgi Gerganov 2024-09-07 15:16:19 +03:00 committed by GitHub
parent 947538acb8
commit df270ef745
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 3497 additions and 2914 deletions

View file

@ -1,8 +1,11 @@
#pragma once
#define LLAMA_API_INTERNAL
#include "llama.h"
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
@ -29,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
// helpers
//
struct time_meas {
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
const int64_t t_start_us;
int64_t & t_acc;
};
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
@ -45,3 +62,113 @@ static void replace_all(std::string & s, const std::string & search, const std::
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
// the ring buffer works similarly to std::deque, but with a fixed capacity
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
T & front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
void push_back(const T & value) {
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
//T & operator[](size_t i) {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
//const T & at(size_t i) const {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void clear() {
// here only reset the status of the buffer
sz = 0;
first = 0;
pos = 0;
}
bool empty() const {
return sz == 0;
}
size_t size() const {
return sz;
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};