llama : refactor sampling v2 (#9294)
- Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
This commit is contained in:
parent
947538acb8
commit
df270ef745
48 changed files with 3497 additions and 2914 deletions
129
src/llama-impl.h
129
src/llama-impl.h
|
@ -1,8 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
|
@ -29,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
|||
// helpers
|
||||
//
|
||||
|
||||
struct time_meas {
|
||||
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
~time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t t_start_us;
|
||||
|
||||
int64_t & t_acc;
|
||||
};
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return;
|
||||
|
@ -45,3 +62,113 @@ static void replace_all(std::string & s, const std::string & search, const std::
|
|||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
template<typename T>
|
||||
struct ring_buffer {
|
||||
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||
|
||||
T & front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
const T & front() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[first];
|
||||
}
|
||||
|
||||
T & back() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
const T & back() const {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
return data[pos];
|
||||
}
|
||||
|
||||
void push_back(const T & value) {
|
||||
if (sz == capacity) {
|
||||
// advance the start when buffer is full
|
||||
first = (first + 1) % capacity;
|
||||
} else {
|
||||
sz++;
|
||||
}
|
||||
data[pos] = value;
|
||||
pos = (pos + 1) % capacity;
|
||||
}
|
||||
|
||||
T pop_front() {
|
||||
if (sz == 0) {
|
||||
throw std::runtime_error("ring buffer is empty");
|
||||
}
|
||||
T value = data[first];
|
||||
first = (first + 1) % capacity;
|
||||
sz--;
|
||||
return value;
|
||||
}
|
||||
|
||||
//T & operator[](size_t i) {
|
||||
// if (i >= sz) {
|
||||
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||
// }
|
||||
// return data[(first + i) % capacity];
|
||||
//}
|
||||
|
||||
//const T & at(size_t i) const {
|
||||
// if (i >= sz) {
|
||||
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||
// }
|
||||
// return data[(first + i) % capacity];
|
||||
//}
|
||||
|
||||
const T & rat(size_t i) const {
|
||||
if (i >= sz) {
|
||||
throw std::runtime_error("ring buffer: index out of bounds");
|
||||
}
|
||||
return data[(first + sz - i - 1) % capacity];
|
||||
}
|
||||
|
||||
std::vector<T> to_vector() const {
|
||||
std::vector<T> result;
|
||||
result.reserve(sz);
|
||||
for (size_t i = 0; i < sz; i++) {
|
||||
result.push_back(data[(first + i) % capacity]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
// here only reset the status of the buffer
|
||||
sz = 0;
|
||||
first = 0;
|
||||
pos = 0;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return sz == 0;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return sz;
|
||||
}
|
||||
|
||||
size_t capacity = 0;
|
||||
size_t sz = 0;
|
||||
size_t first = 0;
|
||||
size_t pos = 0;
|
||||
std::vector<T> data;
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue