Merge remote-tracking branch 'origin/master' into batch_perplexity

This commit is contained in:
Gary Linscott 2023-04-13 08:13:09 -07:00
commit fbcecd59a9
42 changed files with 2874 additions and 1150 deletions

View file

@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]
ENTRYPOINT [ "/main" ]

View file

@ -21,4 +21,4 @@ models/*
arm_neon.h
compile_commands.json
Dockerfile
Dockerfile

5
.ecrc Normal file
View file

@ -0,0 +1,5 @@
{
"Disable": {
"IndentSize": true
}
}

19
.editorconfig Normal file
View file

@ -0,0 +1,19 @@
# https://EditorConfig.org
# Top-most EditorConfig file
root = true
# Unix-style newlines with a newline ending every file, utf-8 charset
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
indent_style = space
indent_size = 4
[Makefile]
indent_style = tab
[prompts/*.txt]
insert_final_newline = unset

View file

@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and
# Current Behavior
Please provide a detailed written description of what `llama.cpp` did, instead.
Please provide a detailed written description of what `llama.cpp` did, instead.
# Environment and Context
# Environment and Context
Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
llama_model_load: .......................................................................................... done
llama_model_load: model size = 4869.09 MB / num tensors = 723
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
main: prompt: 'Please close your issue when it has been answered.'
main: number of tokens in prompt = 11
@ -166,14 +166,14 @@ main: total time = 246406.42 ms
Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
3636882.89 msec task-clock # 14.677 CPUs utilized
13509 context-switches # 3.714 /sec
2436 cpu-migrations # 0.670 /sec
10476679 page-faults # 2.881 K/sec
13133115082869 cycles # 3.611 GHz (16.77%)
29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
23479217109614 instructions # 1.79 insn per cycle
23479217109614 instructions # 1.79 insn per cycle
# 0.44 stalled cycles per insn (16.76%)
2353072268027 branches # 647.002 M/sec (16.77%)
1998682780 branch-misses # 0.08% of all branches (16.76%)

View file

@ -60,4 +60,4 @@ jobs:
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}
file: ${{ matrix.config.dockerfile }}

17
.github/workflows/editorconfig.yml vendored Normal file
View file

@ -0,0 +1,17 @@
name: EditorConfig Checker
on:
push:
branches:
- master
pull_request:
branches:
- master
jobs:
editorconfig:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: editorconfig-checker/action-editorconfig-checker@main
- run: editorconfig-checker

5
.gitignore vendored
View file

@ -19,9 +19,11 @@ models/*
/main
/quantize
/quantize-stats
/result
/perplexity
/embedding
/benchmark-q4_0-matmult
/Pipfile
arm_neon.h
@ -33,3 +35,6 @@ compile_commands.json
.venv
__pycache__
.swiftpm
zig-out/
zig-cache/

View file

@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX"
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
@ -115,6 +119,7 @@ if (LLAMA_OPENBLAS)
add_compile_definitions(GGML_USE_OPENBLAS)
add_link_options(${BLAS_LIBRARIES})
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
else()
message(WARNING "OpenBLAS not found")
endif()
@ -139,6 +144,7 @@ if (LLAMA_ALL_WARNINGS)
-Wpedantic
-Wcast-qual
-Wno-unused-function
-Wno-multichar
)
else()
# todo : msvc
@ -151,6 +157,10 @@ if (LLAMA_ALL_WARNINGS)
endif()
if (MSVC)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif()
if (LLAMA_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
@ -201,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
add_compile_options(/arch:AVX)
endif()
else()
add_compile_options(-mf16c)
if (LLAMA_F16C)
add_compile_options(-mf16c)
endif()
if (LLAMA_FMA)
add_compile_options(-mfma)
endif()
@ -240,7 +252,8 @@ endif()
add_library(llama
llama.cpp
llama.h)
llama.h
llama_util.h)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump

View file

@ -37,7 +37,7 @@ LDFLAGS =
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific
# TODO: support Windows
@ -72,6 +72,7 @@ endif
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@ -141,14 +142,14 @@ default: main quantize perplexity embedding
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
llama.o: llama.cpp llama.h
llama.o: llama.cpp llama.h llama_util.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
clean:
rm -vf *.o main quantize perplexity embedding
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@ -159,16 +160,26 @@ main: examples/main/main.cpp ggml.o llama.o common.o
quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
libllama.so: llama.o ggml.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
#
# Tests
#
benchmark: ggml.o
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult
.PHONY: tests
tests:
bash ./tests/run-tests.sh

View file

@ -13,7 +13,10 @@ let package = Package(
path: ".",
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
),
],
cxxLanguageStandard: .cxx11

View file

@ -1,6 +1,6 @@
# llama.cpp
![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@ -9,8 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
## Description
@ -28,20 +28,33 @@ Please do not make conclusions about the models based on the results from this i
For all I know, it can be completely wrong. This project is for educational purposes.
New features will probably be added mostly through community contributions.
Supported platforms:
**Supported platforms:**
- [X] Mac OS
- [X] Linux
- [X] Windows (via CMake)
- [X] Docker
Supported models:
**Supported models:**
- [X] LLaMA 🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
**UI:**
- [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
---
@ -137,14 +150,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage
Here are the step for the LLaMA-7B model:
Here are the step for the LLaMA-7B model.
### Get the Code
```bash
# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
```
### Build
Note: For Windows, CMake or Zig can be used.
1. Use `make`
```bash
make
```
1. Use CMake
```bash
mkdir build
cd build
cmake ..
cmake --build . --config Release
```
1. Use Zig
```bash
zig build -Drelease-fast
```
### Prepare Data & Run
```bash
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@ -162,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
When running the larger models, make sure you have enough disk space to store all the intermediate files.
### Memory/Disk Requirements
@ -225,7 +265,7 @@ There 26 letters in the English Alphabet
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
>
```
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
@ -236,19 +276,19 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
```bash
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
```
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
- The original model is saved in the same folder with a suffix `.orig`
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
- The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:
`sha256sum --ignore-missing -c SHA256SUMS` on Linux
@ -266,7 +306,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
### Perplexity (Measuring model quality)
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
@ -333,20 +373,22 @@ We have two Docker images available for this project:
The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On complete, you are ready to play!
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with light image:
```bash
docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
### Contributing
@ -367,3 +409,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
### Docs
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

61
build.zig Normal file
View file

@ -0,0 +1,61 @@
const std = @import("std");
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");
const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();
const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);
// create "zig build run" command for ./main
const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
}
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;
const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();
return exe;
}

View file

@ -31,6 +31,7 @@ if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
endif()

49
examples/Miku.sh Executable file
View file

@ -0,0 +1,49 @@
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

View file

@ -7,4 +7,4 @@
cd `dirname $0`
cd ..
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

View file

@ -0,0 +1,270 @@
/*
License: MIT License
Changelog:
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/
#include <locale.h>
#include "ggml.h"
#include <assert.h>
#include <math.h>
#include <cstring>
#include <cstdio>
#include <cinttypes>
#include <unordered_map>
#include <queue>
#include <string.h>
#include <cassert>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
float tensor_sum_elements(struct ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==6) {
for (int j = 0; j < tensor->ne[1]; j++) {
for (int k = 0; k < tensor->ne[0]; k++) {
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
}
}
}
return sum;
}
/*
These are mapping to unknown
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
*/
#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
struct benchmark_params_struct {
int32_t n_threads = 1;
int32_t n_iterations = 10;
};
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
struct benchmark_params_struct benchmark_params;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_threads = std::stoi(argv[i]);
} else if (arg == "-i" || arg == "--iter") {
if (++i >= argc) {
invalid_param = true;
break;
}
benchmark_params.n_iterations = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, benchmark_params);
exit(0);
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, benchmark_params);
exit(1);
}
}
// create the ggml context
printf("Starting Test\n");
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
#undef VERBOSE_DEBUGGING
#ifndef VERBOSE_DEBUGGING
const int sizey = 4096;
const int sizex = 11008;
const int sizez = 128;
#else
/* Working - let's increase size */
const int sizey = 1;
const int sizex = (8*32);
const int sizez = 1;
/*const int sizey = 1;
const int sizex = 3*(8*32);
const int sizez = 1;*/
#endif
//printf("Memsize required = %i\n", sizex*sizex);
ggml_type wtype = GGML_TYPE_F32;
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizeof(float);
ctx_size += 1024*1024*100;
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
printf("Creating new tensors\n");
// printf("Creating new tensor m1\n");
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m11, 1.0f);
// printf("Creating new tensor m1\n");
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
ggml_set_f32(m12, 1.5f);
// printf("Creating new tensor m2\n");
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
ggml_set_f32(m2, 2.0f);
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
// printf("Creating new tensor m11xm2\n");
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
TENSOR_DUMP(m11);
TENSOR_DUMP(m2);
ggml_graph_compute(ctx, &gf);
TENSOR_DUMP(gf.nodes[0]);
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
int32_t nelements = sizex*sizey;
int32_t ne[2] = { sizex, sizey };
std::vector<int64_t> hist_cur(1 << 4, 0);
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
// Set up a the compute graph
// printf("Creating new tensor q31\n");
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
const int dimx = sizex;
const int dimy = sizey;
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
printf("==============================================================================================\n");
for (int i=0;i<benchmark_params.n_iterations ;i++) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute(ctx, &gf31);
long long int stop = ggml_time_us();
long long int usec = stop-start;
float sec = usec/1000000;
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
i,
gf31.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,flops_per_usec);
#ifdef VERBOSE_DEBUGGING
TENSOR_DUMP("res",gf31.nodes[0])
#endif
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
if (delta > allowed_delta) {
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
sum_of_F32_reference,
sum_of_Q4_result,
delta,
allowed_delta
);
exit(0);
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute(ctx, &gf32);
}
}

View file

@ -1,7 +1,5 @@
#include "common.h"
#include "ggml.h"
#include <cassert>
#include <cstring>
#include <fstream>
@ -16,12 +14,19 @@
#endif
#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
const wchar_t * lpWideCharStr, int cchWideChar,
char * lpMultiByteStr, int cbMultiByte,
const char * lpDefaultChar, bool * lpUsedDefaultChar);
#define CP_UTF8 65001
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@ -154,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
@ -233,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (ggml_mlock_supported()) {
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
@ -307,12 +317,20 @@ void win32_console_init(bool enable_color) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(65001); // CP_UTF8
SetConsoleOutputCP(CP_UTF8);
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF8
SetConsoleCP(65001); // CP_UTF8
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
}
}
// Convert a wide Unicode string to an UTF8 string
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
std::string strTo(size_needed, 0);
WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
str = strTo;
}
#endif

View file

@ -47,6 +47,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
@ -92,4 +93,5 @@ void set_console_color(console_state & con_st, console_color_t color);
#if defined (_WIN32)
void win32_console_init(bool enable_color);
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif

View file

@ -1,3 +1,3 @@
# embedding
TODO
# embedding
TODO

View file

@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;

View file

@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 \
--batch_size 8 --ctx_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View file

@ -1,3 +1,3 @@
# main
TODO
# main
TODO

View file

@ -1,3 +1,8 @@
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
@ -97,6 +102,7 @@ int main(int argc, char ** argv) {
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
@ -162,7 +168,7 @@ int main(int argc, char ** argv) {
}
// enable interactive mode if reverse prompt or interactive start is specified
if (params.antiprompt.size() != 0 || params.interactive_start) {
if (params.antiprompt.size() != 0 || params.interactive_start) {
params.interactive = true;
}
@ -368,6 +374,11 @@ int main(int argc, char ** argv) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
#if defined (_WIN32)
// Windows: must reactivate sigint handler after each signal
signal(SIGINT, sigint_handler);
#endif
if (params.instruct) {
printf("\n> ");
}
@ -381,10 +392,19 @@ int main(int argc, char ** argv) {
std::string line;
bool another_line = true;
do {
#if defined(_WIN32)
std::wstring wline;
if (!std::getline(std::wcin, wline)) {
// input stream is bad or EOF received
return 0;
}
win32_utf8_encode(wline, line);
#else
if (!std::getline(std::cin, line)) {
// input stream is bad or EOF received
return 0;
}
#endif
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
@ -426,7 +446,7 @@ int main(int argc, char ** argv) {
}
// end of text token
if (embd.back() == llama_token_eos()) {
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {

View file

@ -1,3 +1,3 @@
# perplexity
TODO
# perplexity
TODO

View file

@ -119,6 +119,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;

View file

@ -0,0 +1,4 @@
set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,355 @@
#include "ggml.h"
#define LLAMA_API_INTERNAL
#include "llama.h"
#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <numeric>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;
bool reference = false;
std::vector<std::string> include_layers;
std::vector<std::string> exclude_layers;
std::vector<enum ggml_type> include_types;
};
const int64_t SCRATCH_ELEMENTS = 32*32;
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;
struct error_stats {
size_t num_samples;
double total_error;
double max_error;
uint64_t error_histogram[HISTOGRAM_BUCKETS];
};
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
quantize_stats_params params;
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -r, --reference\n");
fprintf(stderr, " use reference implementation (default: false)\n");
fprintf(stderr, " -v, --verbose\n");
fprintf(stderr, " verbose output (default: false)\n");
fprintf(stderr, " -p, --per-layer-stats\n");
fprintf(stderr, " print stats per layer (default: false)\n");
fprintf(stderr, " --histogram\n");
fprintf(stderr, " print error histogram (default: false)\n");
fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
fprintf(stderr, " only test layers matching pattern\n");
fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
fprintf(stderr, " exclude layers matching pattern\n");
fprintf(stderr, " -t TYPE, --type TYPE\n");
fprintf(stderr, " only test given type (q4_0, q4_1)\n");
fprintf(stderr, "\n");
}
// Check if a layer is included/excluded by command line
bool layer_included(const quantize_stats_params params, const std::string & layer) {
for (const auto& excluded : params.exclude_layers) {
if (std::regex_search(layer, std::regex(excluded))) {
return false;
}
}
for (const auto& included : params.include_layers) {
if (std::regex_search(layer, std::regex(included))) {
return true;
}
}
return params.include_layers.empty();
}
// Update error statistics given vectors with the before/after result of quantization
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
for (int64_t i = 0; i < nelements; i++) {
double diff = input[i] - output[i];
stats.total_error += diff * diff;
stats.max_error = fmax(fabs(diff), stats.max_error);
stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
}
stats.num_samples += nelements;
}
double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
double accum = 0;
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
accum += stats.error_histogram[i];
if (accum >= sum*quantile) {
return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
}
}
return INFINITY;
}
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
double median = find_quantile(stats, .5);
double pct95 = find_quantile(stats, .95);
printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
if (print_histogram) {
printf("Error distribution:\n");
for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
}
}
}
// copied from ggml.h - verify that we can access this as a flat array
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
// Run quantization function for a single layer and update error stats
void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
bool use_reference,
const ggml_tensor * layer,
float * input_scratch,
char *quantized_scratch,
float * output_scratch,
error_stats & total_error) {
assert(tensor_is_contiguous(layer));
error_stats layer_error {};
int64_t nelements = ggml_nelements(layer);
for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
}
} else {
input_scratch = ggml_get_data_f32(layer) + offset;
}
if (use_reference) {
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
}
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
if (print_layer_stats) {
update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
}
}
if (print_layer_stats) {
print_error_stats(name, layer_error, false);
}
}
int main(int argc, char ** argv) {
ggml_time_init();
quantize_stats_params params;
// read command line
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-h" || arg == "--help") {
quantize_stats_print_usage(argc, argv);
exit(0);
} else if (arg == "-r" || arg == "--reference") {
params.reference = true;
} else if (arg == "-v") {
params.verbose = true;
} else if (arg == "-p" || arg == "--per-layer-stats") {
params.per_layer_stats = true;
} else if (arg == "--histogram") {
params.print_histogram = true;
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "-l" || arg == "--include-layer") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.include_layers.push_back(argv[i]);
} else if (arg == "-L" || arg == "--exclude-layer") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.exclude_layers.push_back(argv[i]);
} else if (arg == "-t" || arg == "--type") {
if (++i >= argc) {
invalid_param = true;
break;
}
int j;
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
// find match
}
if (j < GGML_TYPE_COUNT) {
params.include_types.push_back((ggml_type) j);
} else {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
return 1;
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
return 1;
}
// load the model
fprintf(stderr, "Loading model\n");
const int64_t t_main_start_us = ggml_time_us();
llama_context * ctx;
{
auto lparams = llama_context_default_params();
lparams.n_ctx = 256;
lparams.n_parts = 1;
lparams.seed = 1;
lparams.f16_kv = false;
lparams.use_mlock = false;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
const auto &tensors = llama_internal_get_tensor_map(ctx);
// check layer tensors
int included_layers = 0;
int64_t max_nelements = 0;
bool is_f16 = false;
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (params.verbose) {
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
}
if (kv_tensor.second->type == GGML_TYPE_F16) {
is_f16 = true;
} else if (kv_tensor.second->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
llama_free(ctx);
return 1;
}
included_layers++;
max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
}
if (is_f16) {
printf("note: source model is f16\n");
}
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
// allocate scratch space
std::vector<float> input_scratch(SCRATCH_ELEMENTS);
std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
// loop throught quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", type_strs[i]);
}
error_stats global_stats {};
for (const auto& kv_tensor : tensors) {
if (!layer_included(params, kv_tensor.first)) {
continue;
}
if (params.verbose) {
printf(" %s ...\n", kv_tensor.first.c_str());
}
std::string layer_name { type_strs[i] };
layer_name += "::" + kv_tensor.first;
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
params.reference,
kv_tensor.second,
input_scratch.data(),
quantized_scratch.data(),
output_scratch.data(),
global_stats
);
}
print_error_stats(type_strs[i], global_stats, params.print_histogram);
}
}
llama_free(ctx);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
}
return 0;
}

View file

@ -5,15 +5,15 @@
#include <string>
// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
return 1;
}
@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const int itype = atoi(argv[3]);
const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}

View file

@ -28,8 +28,9 @@
];
installPhase = ''
mkdir -p $out/bin
mv bin/main $out/bin/llama
mv bin/quantize $out/bin/quantize
mv bin/* $out/bin/
mv $out/bin/main $out/bin/llama
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
chmod +x $out/bin/convert-pth-to-ggml

899
ggml.c

File diff suppressed because it is too large Load diff

69
ggml.h
View file

@ -198,13 +198,14 @@ struct ggml_object;
struct ggml_context;
enum ggml_type {
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_F16,
GGML_TYPE_F32,
GGML_TYPE_COUNT,
};
@ -236,6 +237,7 @@ enum ggml_op {
GGML_OP_SCALE,
GGML_OP_CPY,
GGML_OP_CONT,
GGML_OP_RESHAPE,
GGML_OP_VIEW,
GGML_OP_PERMUTE,
@ -253,6 +255,19 @@ enum ggml_op {
GGML_OP_COUNT,
};
// ggml object
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
char padding[8];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -344,13 +359,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -519,6 +527,11 @@ struct ggml_tensor * ggml_cpy(
struct ggml_tensor * a,
struct ggml_tensor * b);
// make contiguous
struct ggml_tensor * ggml_cont(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape(
@ -558,6 +571,16 @@ struct ggml_tensor * ggml_view_2d(
size_t nb1, // row stride in bytes
size_t offset);
struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
size_t nb1, // row stride in bytes
size_t nb2, // slice stride in bytes
size_t offset);
struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -773,6 +796,30 @@ int ggml_cpu_has_blas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);
//
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference;
vec_dot_q_t vec_dot_q;
} quantize_fns_t;
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
#ifdef __cplusplus
}
#endif

1612
llama.cpp

File diff suppressed because it is too large Load diff

26
llama.h
View file

@ -55,6 +55,7 @@ extern "C" {
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
@ -64,8 +65,20 @@ extern "C" {
void * progress_callback_user_data;
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
};
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
@ -81,7 +94,7 @@ extern "C" {
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
enum llama_ftype ftype);
// Returns the KV cache that will contain the context for the
// ongoing prediction with the model.
@ -166,4 +179,15 @@ extern "C" {
}
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <string>
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
#endif // LLAMA_H

389
llama_util.h Executable file
View file

@ -0,0 +1,389 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <vector>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __GNUC__
__attribute__((format(printf, 1, 2)))
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
LLAMA_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
};
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw format("failed to open %s: %s", fname, std::strerror(errno));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
LLAMA_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
throw format("read error: %s", strerror(errno));
}
if (ret != 1) {
throw std::string("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
throw format("write error: %s", strerror(errno));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
#ifdef __linux__
flags |= MAP_POPULATE;
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
close(fd);
if (addr == MAP_FAILED) {
throw format("mmap failed: %s", strerror(errno));
}
// Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
~llama_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) {
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
CloseHandle(hFile);
if (hMapping == NULL) {
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *) {
throw std::string("mmap not supported");
}
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * addr) {
LLAMA_ASSERT(this->addr == NULL && this->size == 0);
this->addr = addr;
}
void grow_to(size_t target_size) {
LLAMA_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
size, this->size, std::strerror(errno));
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * addr, size_t size) {
for (int tries = 1; ; tries++) {
if (VirtualLock(addr, size)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
size, this->size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = size + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * addr, size_t size) {
if (!VirtualUnlock(addr, size)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
void raw_lock(const void * addr, size_t size) {
fprintf(stderr, "warning: mlock not supported on this system\n");
}
void raw_unlock(const void * addr, size_t size) {}
#endif
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
this->size = size;
}
~llama_buffer() {
delete[] addr;
}
};
#endif

BIN
media/llama-leader.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 195 KiB

BIN
media/llama0-banner.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

BIN
media/llama0-logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 176 KiB

BIN
media/llama1-banner.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

BIN
media/llama1-logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View file

@ -4,4 +4,4 @@ User: Hello, Bob.
Bob: Hello. How may I help you today?
User: Please tell me the largest city in Europe.
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
User:
User:

View file

@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
Question: What is capital of france?
Thought: Do I need to use an action? No, I know the answer
Answer: Paris is the capital of France
Question:
Question: