Merge branch 'master' into optimize-convert

tpoisonooo 2023-03-18 18:41:28 +08:00 committed by GitHub
commit a44ccef6ac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 522 additions and 42 deletions

17
.devops/full.Dockerfile Normal file
View file

@ -0,0 +1,17 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip
RUN pip install --upgrade pip setuptools wheel \
&& pip install torch torchvision torchaudio sentencepiece numpy
WORKDIR /app
COPY . .
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

18
.devops/main.Dockerfile Normal file
View file

@ -0,0 +1,18 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential
WORKDIR /app
COPY . .
RUN make
FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

46
.devops/tools.sh Executable file
View file

@ -0,0 +1,46 @@
#!/bin/bash
set -e
# Read the first argument into a variable
arg1="$1"

# Shift the arguments to remove the first one
shift

# Join the remaining arguments into a single string
arg2="$@"

if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
    python3 ./convert-pth-to-ggml.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
    ./main $arg2
elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
    python3 ./download-pth.py $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
    echo "Downloading model..."
    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in "$1"/"$2"/ggml-model-f16.bin*; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Quantizing $i into ${i/f16/q4_0}..."
            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
    echo "                  ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Quantize a ggml model"
    echo "                   ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --download (-d): Download the original llama model from the CDN: https://agi.gpt4.org/llama/"
    echo "                   ex: \"/models/\" 7B"
    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "                     ex: \"/models/\" 7B"
fi
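For readers less familiar with bash parameter expansion, the quantization loop above maps each `ggml-model-f16.bin*` shard to a `q4_0` output name and skips shards that are already quantized. A rough Python sketch of the same loop (illustrative only; `quantize_all` and its arguments are not part of this commit):

```python
import glob
import os
import subprocess
import sys

def quantize_all(models_dir: str, model_type: str) -> None:
    """Quantize every f16 shard to q4_0, skipping shards whose output already exists."""
    pattern = os.path.join(models_dir, model_type, "ggml-model-f16.bin*")
    for src in sorted(glob.glob(pattern)):
        dst = src.replace("f16", "q4_0")  # mirrors the bash substitution ${i/f16/q4_0}
        if os.path.exists(dst):
            print(f"Skip model quantization, it already exists: {dst}")
            continue
        print(f"Quantizing {src} into {dst}...")
        subprocess.run(["./quantize", src, dst, "2"], check=True)  # 2 selects q4_0, as in the script

if __name__ == "__main__":
    quantize_all(sys.argv[1], sys.argv[2])
```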

24
.dockerignore Normal file
View file

@ -0,0 +1,24 @@
*.o
*.a
.cache/
.vs/
.vscode/
.DS_Store
build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
models/*
/main
/quantize
arm_neon.h
compile_commands.json
Dockerfile

View file

@ -1,8 +1,42 @@
 name: CI
 
-on: [push, pull_request]
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+  pull_request:
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest:
+  ubuntu-latest-make:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
     steps:
@ -15,10 +49,31 @@ jobs:
           sudo apt-get install build-essential
 
       - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
         run: |
           make
 
-  macOS-latest:
+  macOS-latest-cmake:
     runs-on: macOS-latest
 
     steps:
@ -31,22 +86,59 @@ jobs:
       - name: Build
         run: |
-          make
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
 
-  windows-latest:
+  windows-latest-cmake:
     runs-on: windows-latest
 
     steps:
       - name: Clone
+        id: checkout
         uses: actions/checkout@v1
 
       - name: Build
+        id: cmake_build
         run: |
           mkdir build
           cd build
           cmake ..
           cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
+
+      - name: Create release
+        id: create_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      - name: Upload release
+        id: upload_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream
 
 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
 #
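The release-packaging steps above derive both the zip name and the release tag from the branch name and the short commit hash. A small illustrative sketch of that naming scheme (the helper names and example values below are hypothetical, not part of the workflow):

```python
def artifact_name(branch: str, short_sha: str) -> str:
    # llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
    return f"llama-{branch}-{short_sha}-bin-win-x64.zip"

def release_tag(branch: str, short_sha: str) -> str:
    # ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
    return f"{branch}-{short_sha}"

# Hypothetical values:
print(artifact_name("master", "abc1234"))  # llama-master-abc1234-bin-win-x64.zip
print(release_tag("master", "abc1234"))    # master-abc1234
```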

61
.github/workflows/docker.yml vendored Normal file
View file

@ -0,0 +1,61 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# GitHub recommends pinning actions to a commit SHA.
# To get a newer version, you will need to update the SHA.
# You can also reference a tag or branch, but the action may change without warning.
name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Log in to Docker Hub
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
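Each matrix entry above is built twice on a push to master: once with a tag pinned to the commit SHA and once with a floating `light`/`full` tag; on pull requests the images are only built, not pushed. A hedged sketch of the resulting tag strings, using a hypothetical commit SHA:

```python
def image_tags(variant: str, commit_sha: str) -> list:
    repo = "ghcr.io/ggerganov/llama.cpp"
    return [
        f"{repo}:{variant}-{commit_sha}",  # versioned tag (built and pushed on push events only)
        f"{repo}:{variant}",               # floating tag (always built, pushed on push events only)
    ]

for variant in ("light", "full"):
    print(image_tags(variant, "0123456789abcdef"))  # hypothetical commit SHA
```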

4
.gitignore vendored
View file

@ -18,6 +18,10 @@ models/*
 /main
 /quantize
+/result
 
 arm_neon.h
 compile_commands.json
+
+.envrc
+.direnv/

CMakeLists.txt
View file

@ -4,6 +4,8 @@ project("llama.cpp")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
 target_link_libraries(quantize PRIVATE ggml)
 target_link_libraries(llama PRIVATE ggml)
+target_link_libraries(ggml PRIVATE Threads::Threads)

README.md
View file

@ -32,13 +32,14 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker
 
 ---
 
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@ -149,7 +150,7 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -163,7 +164,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 Here is an example few-shot interaction, invoked with the command
 
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@ -194,6 +195,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
### Docker
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (for example, /llama/models)
#### Images
We have two Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
#### Usage
The easiest way to download the models, convert them to ggml, and optimize them is with the --all-in-one command, which is included in the full Docker image.
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
Once it completes, you are ready to play!
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with the light image:
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
## Limitations
@ -210,6 +242,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 
 ### Coding guidelines

convert-pth-to-ggml.py
View file

@ -16,7 +16,7 @@
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
import os
import sys
import json
import struct
@ -59,7 +59,6 @@ def get_n_parts(dim):
    print("Invalid dim: " + str(dim))
    sys.exit(1)

def main():
    args = parse_args()
    dir_model = args.dir_model

66
download-pth.py Normal file
View file

@ -0,0 +1,66 @@
import os
import sys
from tqdm import tqdm
import requests
if len(sys.argv) < 3:
    print("Usage: download-pth.py dir-model model-type\n")
    print("  model-type: Available models 7B, 13B, 30B or 65B")
    sys.exit(1)

modelsDir = sys.argv[1]
model = sys.argv[2]

num = {
    "7B": 1,
    "13B": 2,
    "30B": 4,
    "65B": 8,
}

if model not in num:
    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
    sys.exit(1)

print(f"Downloading model {model}")

files = ["checklist.chk", "params.json"]
for i in range(num[model]):
    files.append(f"consolidated.0{i}.pth")

resolved_path = os.path.abspath(os.path.join(modelsDir, model))
os.makedirs(resolved_path, exist_ok=True)

for file in files:
    dest_path = os.path.join(resolved_path, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))

files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
for file in files2:
    dest_path = os.path.join(modelsDir, file)

    if os.path.exists(dest_path):
        print(f"Skip file download, it already exists: {file}")
        continue

    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    t.update(len(chunk))
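As a quick check of the `num` mapping above: the script requests `checklist.chk`, `params.json`, and one `consolidated.0N.pth` shard per part, plus the shared tokenizer files. A tiny illustrative sketch (not part of the script) of the file list it builds for a given model size:

```python
num = {"7B": 1, "13B": 2, "30B": 4, "65B": 8}

def shard_files(model: str) -> list:
    # Same list download-pth.py builds before fetching the tokenizer files.
    return ["checklist.chk", "params.json"] + [f"consolidated.0{i}.pth" for i in range(num[model])]

print(shard_files("13B"))
# ['checklist.chk', 'params.json', 'consolidated.00.pth', 'consolidated.01.pth']
```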

43
flake.lock generated Normal file
View file

@ -0,0 +1,43 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1676283394,
        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1678470307,
        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

48
flake.nix Normal file
View file

@ -0,0 +1,48 @@
{
  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    flake-utils.url = "github:numtide/flake-utils";
  };
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = import nixpkgs {
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
          torch
          numpy
          sentencepiece
        ]);
      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
          src = ./.;
          nativeBuildInputs = with pkgs; [ cmake ];
          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
          ];
          installPhase = ''
            mkdir -p $out/bin
            mv llama $out/bin/llama
            mv quantize $out/bin/quantize
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
          '';
        };
        devShells.default = pkgs.mkShell {
          packages = with pkgs; [
            cmake
            llama-python
          ] ++ lib.optionals stdenv.isDarwin [
            darwin.apple_sdk.frameworks.Accelerate
          ];
        };
      }
    );
}

4
ggml.c
View file

@ -9318,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {

main.cpp
View file

@ -845,6 +845,8 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
 
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);

utils.cpp
View file

@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
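The block above makes the default thread count match the number of `processor` entries in `/proc/cpuinfo` on Linux, falling back to `std::thread::hardware_concurrency()` when that yields zero. A rough Python sketch of the same detection, for illustration only (`default_n_threads` is not part of the codebase):

```python
import os

def default_n_threads() -> int:
    n = 0
    try:
        # Count standalone "processor" tokens, like std::count over istream_iterator<std::string>.
        with open("/proc/cpuinfo") as f:
            n = sum(1 for word in f.read().split() if word == "processor")
    except OSError:
        pass  # not Linux, or /proc unavailable
    if n == 0:
        n = os.cpu_count() or 1  # analogous to std::thread::hardware_concurrency()
    return max(1, n)

print(default_n_threads())
```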
@ -275,40 +287,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
+    //auto res = gpt_tokenize(vocab, text);
+    //if (bos) {
+    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
+    //}
+
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= len - i; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
+
     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-        if (l == 0) {
-            break;
-        }
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
 
     return res;
 }
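The rewritten `llama_tokenize` is a dynamic-programming tokenizer in the spirit of SentencePiece: the forward pass scores every reachable end position, preferring longer vocabulary pieces (a piece of length L adds L² to the score), and the backward pass follows `prev` to recover the chosen pieces in reverse order (note that `max_len` is computed but the inner loop still scans to the end of the string). A toy Python sketch of the same idea, with a made-up vocabulary rather than the real LLaMA one:

```python
def tokenize_dp(text: str, token_to_id: dict, id_to_token: dict) -> list:
    """Illustrative re-implementation of the forward/backward passes above."""
    n = len(text)
    score = [0] * (n + 1)
    prev = [0] * (n + 1)

    # Forward pass: keep the best-scoring way to reach each position.
    for i in range(n):
        for sub_len in range(1, n - i + 1):
            sub = text[i:i + sub_len]
            tok = token_to_id.get(sub)
            if tok is not None and score[i] + sub_len * sub_len > score[i + sub_len]:
                score[i + sub_len] = score[i] + sub_len * sub_len
                prev[i + sub_len] = tok

    # Backward pass: walk back from the end; pieces come out in reverse order.
    res = []
    i = n
    while i > 0:
        tok = prev[i]
        if tok == 0:
            raise ValueError("failed to tokenize string")
        res.append(tok)
        i -= len(id_to_token[tok])

    res.reverse()
    return res

# Made-up vocabulary (ids start at 1 because 0 marks "unreachable", as in the C++ code):
token_to_id = {"h": 1, "e": 2, "l": 3, "o": 4, "he": 5, "ll": 6, "hell": 7, "hello": 8}
id_to_token = {v: k for k, v in token_to_id.items()}
print(tokenize_dp("hello", token_to_id, id_to_token))  # [8]: the single longest piece wins
```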

utils.h
View file

@ -18,7 +18,7 @@ struct gpt_params {
    int32_t n_predict = 128; // new tokens to predict
    int32_t repeat_last_n = 64; // last n tokens to penalize
    int32_t n_ctx = 512; //context size

    // sampling parameters
    int32_t top_k = 40;
    float   top_p = 0.95f;