llama : minor stuff

This commit is contained in:
Georgi Gerganov 2023-12-29 19:32:30 +02:00
parent d24da31d2f
commit 128c213ab5
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 21 additions and 23 deletions

View file

@ -78,7 +78,6 @@
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <iostream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -7006,9 +7005,9 @@ struct llm_tokenizer_bpe {
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
int final_prev_index = -1;
std::vector<std::string> word_collection;
switch (vocab.type)
{
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_BPE:
word_collection = bpe_gpt2_preprocess(text);
break;

View file

@ -70,8 +70,8 @@ extern "C" {
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // deepseek coder
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // Deepseek LLM
};
enum llama_token_type {

View file

@ -7,7 +7,6 @@
#include <locale>
#include <codecvt>
#include <string>
#include <cstring>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},