rename llama|main -> llama-cli; consistent RPM bin prefixes
This commit is contained in:
		
							parent
							
								
									347f30803f
								
							
						
					
					
						commit
						5265c15d4c
					
				
					 51 changed files with 142 additions and 144 deletions
				
			
		|  | @ -15,7 +15,7 @@ node('x86_runner1'){            // Running on x86 runner containing latest vecto | ||||||
|     stage('Running llama.cpp'){ |     stage('Running llama.cpp'){ | ||||||
|         sh'''#!/bin/bash |         sh'''#!/bin/bash | ||||||
|             module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc |             module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc | ||||||
|             qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64 |             qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64 | ||||||
|             cat llama_log.txt                   # Printing results |             cat llama_log.txt                   # Printing results | ||||||
|         ''' |         ''' | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -36,9 +36,9 @@ make -j LLAMA_CLBLAST=1 | ||||||
| 
 | 
 | ||||||
| %install | %install | ||||||
| mkdir -p %{buildroot}%{_bindir}/ | mkdir -p %{buildroot}%{_bindir}/ | ||||||
| cp -p llama %{buildroot}%{_bindir}/llamaclblast | cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli | ||||||
| cp -p llama-server %{buildroot}%{_bindir}/llamaclblastserver | cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server | ||||||
| cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple | cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple | ||||||
| 
 | 
 | ||||||
| mkdir -p %{buildroot}/usr/lib/systemd/system | mkdir -p %{buildroot}/usr/lib/systemd/system | ||||||
| %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service | %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service | ||||||
|  | @ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t | ||||||
| [Service] | [Service] | ||||||
| Type=simple | Type=simple | ||||||
| EnvironmentFile=/etc/sysconfig/llama | EnvironmentFile=/etc/sysconfig/llama | ||||||
| ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS | ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS | ||||||
| ExecReload=/bin/kill -s HUP $MAINPID | ExecReload=/bin/kill -s HUP $MAINPID | ||||||
| Restart=never | Restart=never | ||||||
| 
 | 
 | ||||||
|  | @ -67,9 +67,9 @@ rm -rf %{buildroot} | ||||||
| rm -rf %{_builddir}/* | rm -rf %{_builddir}/* | ||||||
| 
 | 
 | ||||||
| %files | %files | ||||||
| %{_bindir}/llamaclblast | %{_bindir}/llama-clblast-cli | ||||||
| %{_bindir}/llamaclblastserver | %{_bindir}/llama-clblast-server | ||||||
| %{_bindir}/llamaclblastsimple | %{_bindir}/llama-clblast-simple | ||||||
| /usr/lib/systemd/system/llamaclblast.service | /usr/lib/systemd/system/llamaclblast.service | ||||||
| %config /etc/sysconfig/llama | %config /etc/sysconfig/llama | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -36,9 +36,9 @@ make -j LLAMA_CUDA=1 | ||||||
| 
 | 
 | ||||||
| %install | %install | ||||||
| mkdir -p %{buildroot}%{_bindir}/ | mkdir -p %{buildroot}%{_bindir}/ | ||||||
| cp -p llama %{buildroot}%{_bindir}/llamacppcuda | cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli | ||||||
| cp -p llama-server %{buildroot}%{_bindir}/llamacppcudaserver | cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server | ||||||
| cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple | cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple | ||||||
| 
 | 
 | ||||||
| mkdir -p %{buildroot}/usr/lib/systemd/system | mkdir -p %{buildroot}/usr/lib/systemd/system | ||||||
| %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service | %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service | ||||||
|  | @ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t | ||||||
| [Service] | [Service] | ||||||
| Type=simple | Type=simple | ||||||
| EnvironmentFile=/etc/sysconfig/llama | EnvironmentFile=/etc/sysconfig/llama | ||||||
| ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS | ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS | ||||||
| ExecReload=/bin/kill -s HUP $MAINPID | ExecReload=/bin/kill -s HUP $MAINPID | ||||||
| Restart=never | Restart=never | ||||||
| 
 | 
 | ||||||
|  | @ -67,9 +67,9 @@ rm -rf %{buildroot} | ||||||
| rm -rf %{_builddir}/* | rm -rf %{_builddir}/* | ||||||
| 
 | 
 | ||||||
| %files | %files | ||||||
| %{_bindir}/llamacppcuda | %{_bindir}/llama-cuda-cli | ||||||
| %{_bindir}/llamacppcudaserver | %{_bindir}/llama-cuda-server | ||||||
| %{_bindir}/llamacppcudasimple | %{_bindir}/llama-cuda-simple | ||||||
| /usr/lib/systemd/system/llamacuda.service | /usr/lib/systemd/system/llamacuda.service | ||||||
| %config /etc/sysconfig/llama | %config /etc/sysconfig/llama | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -38,9 +38,9 @@ make -j | ||||||
| 
 | 
 | ||||||
| %install | %install | ||||||
| mkdir -p %{buildroot}%{_bindir}/ | mkdir -p %{buildroot}%{_bindir}/ | ||||||
| cp -p llama %{buildroot}%{_bindir}/llama | cp -p llama-cli %{buildroot}%{_bindir}/llama-cli | ||||||
| cp -p llama-server %{buildroot}%{_bindir}/llamaserver | cp -p llama-server %{buildroot}%{_bindir}/llama-server | ||||||
| cp -p simple %{buildroot}%{_bindir}/llamasimple | cp -p llama-simple %{buildroot}%{_bindir}/llama-simple | ||||||
| 
 | 
 | ||||||
| mkdir -p %{buildroot}/usr/lib/systemd/system | mkdir -p %{buildroot}/usr/lib/systemd/system | ||||||
| %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service | %{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service | ||||||
|  | @ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t | ||||||
| [Service] | [Service] | ||||||
| Type=simple | Type=simple | ||||||
| EnvironmentFile=/etc/sysconfig/llama | EnvironmentFile=/etc/sysconfig/llama | ||||||
| ExecStart=/usr/bin/llamaserver $LLAMA_ARGS | ExecStart=/usr/bin/llama-server $LLAMA_ARGS | ||||||
| ExecReload=/bin/kill -s HUP $MAINPID | ExecReload=/bin/kill -s HUP $MAINPID | ||||||
| Restart=never | Restart=never | ||||||
| 
 | 
 | ||||||
|  | @ -69,9 +69,9 @@ rm -rf %{buildroot} | ||||||
| rm -rf %{_builddir}/* | rm -rf %{_builddir}/* | ||||||
| 
 | 
 | ||||||
| %files | %files | ||||||
| %{_bindir}/llama | %{_bindir}/llama-cli | ||||||
| %{_bindir}/llamaserver | %{_bindir}/llama-server | ||||||
| %{_bindir}/llamasimple | %{_bindir}/llama-simple | ||||||
| /usr/lib/systemd/system/llama.service | /usr/lib/systemd/system/llama.service | ||||||
| %config /etc/sysconfig/llama | %config /etc/sysconfig/llama | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -30,6 +30,6 @@ FROM ${BASE_CUDA_RUN_CONTAINER} as runtime | ||||||
| RUN apt-get update && \ | RUN apt-get update && \ | ||||||
|     apt-get install -y libgomp1 |     apt-get install -y libgomp1 | ||||||
| 
 | 
 | ||||||
| COPY --from=build /app/llama /llama | COPY --from=build /app/llama-cli /llama-cli | ||||||
| 
 | 
 | ||||||
| ENTRYPOINT [ "/main" ] | ENTRYPOINT [ "/llama-cli" ] | ||||||
|  |  | ||||||
|  | @ -27,8 +27,8 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ | ||||||
| 
 | 
 | ||||||
| FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime | FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime | ||||||
| 
 | 
 | ||||||
| COPY --from=build /app/build/bin/llama /llama | COPY --from=build /app/build/bin/llama-cli /llama-cli | ||||||
| 
 | 
 | ||||||
| ENV LC_ALL=C.utf8 | ENV LC_ALL=C.utf8 | ||||||
| 
 | 
 | ||||||
| ENTRYPOINT [ "/main" ] | ENTRYPOINT [ "/llama-cli" ] | ||||||
|  |  | ||||||
|  | @ -42,4 +42,4 @@ ENV CXX=/opt/rocm/llvm/bin/clang++ | ||||||
| 
 | 
 | ||||||
| RUN make -j$(nproc) llama | RUN make -j$(nproc) llama | ||||||
| 
 | 
 | ||||||
| ENTRYPOINT [ "/app/main" ] | ENTRYPOINT [ "/app/llama-cli" ] | ||||||
|  |  | ||||||
|  | @ -19,9 +19,9 @@ RUN cmake -B build -DLLAMA_VULKAN=1 && \ | ||||||
| 
 | 
 | ||||||
| # Clean up | # Clean up | ||||||
| WORKDIR / | WORKDIR / | ||||||
| RUN cp /app/build/bin/llama /llama && \ | RUN cp /app/build/bin/llama-cli /llama-cli && \ | ||||||
|     rm -rf /app |     rm -rf /app | ||||||
| 
 | 
 | ||||||
| ENV LC_ALL=C.utf8 | ENV LC_ALL=C.utf8 | ||||||
| 
 | 
 | ||||||
| ENTRYPOINT [ "/main" ] | ENTRYPOINT [ "/llama-cli" ] | ||||||
|  |  | ||||||
|  | @ -16,8 +16,8 @@ FROM ubuntu:$UBUNTU_VERSION as runtime | ||||||
| RUN apt-get update && \ | RUN apt-get update && \ | ||||||
|     apt-get install -y libgomp1 |     apt-get install -y libgomp1 | ||||||
| 
 | 
 | ||||||
| COPY --from=build /app/llama /llama | COPY --from=build /app/llama-cli /llama-cli | ||||||
| 
 | 
 | ||||||
| ENV LC_ALL=C.utf8 | ENV LC_ALL=C.utf8 | ||||||
| 
 | 
 | ||||||
| ENTRYPOINT [ "/main" ] | ENTRYPOINT [ "/llama-cli" ] | ||||||
|  |  | ||||||
|  | @ -292,7 +292,7 @@ effectiveStdenv.mkDerivation ( | ||||||
|       license = lib.licenses.mit; |       license = lib.licenses.mit; | ||||||
| 
 | 
 | ||||||
|       # Accommodates `nix run` and `lib.getExe` |       # Accommodates `nix run` and `lib.getExe` | ||||||
|       mainProgram = "llama"; |       mainProgram = "llama-cli"; | ||||||
| 
 | 
 | ||||||
|       # These people might respond, on the best effort basis, if you ping them |       # These people might respond, on the best effort basis, if you ping them | ||||||
|       # in case of Nix-specific regressions or for reviewing Nix-specific PRs. |       # in case of Nix-specific regressions or for reviewing Nix-specific PRs. | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then | ||||||
| elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then | elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then | ||||||
|     ./llama-quantize "$@" |     ./llama-quantize "$@" | ||||||
| elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then | elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then | ||||||
|     ./llama "$@" |     ./llama-cli "$@" | ||||||
| elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then | elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then | ||||||
|     ./llama-finetune "$@" |     ./llama-finetune "$@" | ||||||
| elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then | elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then | ||||||
|  |  | ||||||
|  | @ -12,7 +12,7 @@ build*/ | ||||||
| 
 | 
 | ||||||
| models/* | models/* | ||||||
| 
 | 
 | ||||||
| /llama | /llama-cli | ||||||
| /llama-quantize | /llama-quantize | ||||||
| 
 | 
 | ||||||
| arm_neon.h | arm_neon.h | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/01-bug-low.yml
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/01-bug-low.yml
									
										
									
									
										vendored
									
									
								
							|  | @ -24,7 +24,7 @@ body: | ||||||
|       label: Name and Version |       label: Name and Version | ||||||
|       description: Which executable and which version of our software are you running? (use `--version` to get a version string) |       description: Which executable and which version of our software are you running? (use `--version` to get a version string) | ||||||
|       placeholder: | |       placeholder: | | ||||||
|         $./llama --version |         $./llama-cli --version | ||||||
|         version: 2999 (42b4109e) |         version: 2999 (42b4109e) | ||||||
|         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu |         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | ||||||
|     validations: |     validations: | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/02-bug-medium.yml
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/02-bug-medium.yml
									
										
									
									
										vendored
									
									
								
							|  | @ -24,7 +24,7 @@ body: | ||||||
|       label: Name and Version |       label: Name and Version | ||||||
|       description: Which executable and which version of our software are you running? (use `--version` to get a version string) |       description: Which executable and which version of our software are you running? (use `--version` to get a version string) | ||||||
|       placeholder: | |       placeholder: | | ||||||
|         $./llama --version |         $./llama-cli --version | ||||||
|         version: 2999 (42b4109e) |         version: 2999 (42b4109e) | ||||||
|         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu |         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | ||||||
|     validations: |     validations: | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/03-bug-high.yml
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/03-bug-high.yml
									
										
									
									
										vendored
									
									
								
							|  | @ -24,7 +24,7 @@ body: | ||||||
|       label: Name and Version |       label: Name and Version | ||||||
|       description: Which executable and which version of our software are you running? (use `--version` to get a version string) |       description: Which executable and which version of our software are you running? (use `--version` to get a version string) | ||||||
|       placeholder: | |       placeholder: | | ||||||
|         $./llama --version |         $./llama-cli --version | ||||||
|         version: 2999 (42b4109e) |         version: 2999 (42b4109e) | ||||||
|         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu |         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | ||||||
|     validations: |     validations: | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								.github/ISSUE_TEMPLATE/04-bug-critical.yml
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/ISSUE_TEMPLATE/04-bug-critical.yml
									
										
									
									
										vendored
									
									
								
							|  | @ -24,7 +24,7 @@ body: | ||||||
|       label: Name and Version |       label: Name and Version | ||||||
|       description: Which executable and which version of our software are you running? (use `--version` to get a version string) |       description: Which executable and which version of our software are you running? (use `--version` to get a version string) | ||||||
|       placeholder: | |       placeholder: | | ||||||
|         $./llama --version |         $./llama-cli --version | ||||||
|         version: 2999 (42b4109e) |         version: 2999 (42b4109e) | ||||||
|         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu |         built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu | ||||||
|     validations: |     validations: | ||||||
|  |  | ||||||
							
								
								
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							|  | @ -47,7 +47,6 @@ models-mnt | ||||||
| 
 | 
 | ||||||
| /Pipfile | /Pipfile | ||||||
| /libllama.so | /libllama.so | ||||||
| /llama |  | ||||||
| /llama-* | /llama-* | ||||||
| llama-batched-swift | llama-batched-swift | ||||||
| /common/build-info.cpp | /common/build-info.cpp | ||||||
|  |  | ||||||
							
								
								
									
										10
									
								
								Makefile
									
										
									
									
									
								
							
							
						
						
									
										10
									
								
								Makefile
									
										
									
									
									
								
							|  | @ -1,12 +1,12 @@ | ||||||
| # Define the default target now so that it is always the first target
 | # Define the default target now so that it is always the first target
 | ||||||
| BUILD_TARGETS = \
 | BUILD_TARGETS = \
 | ||||||
| 	libllava.a \
 | 	libllava.a \
 | ||||||
| 	llama \
 |  | ||||||
| 	llama-baby \
 | 	llama-baby \
 | ||||||
| 	llama-batched \
 | 	llama-batched \
 | ||||||
| 	llama-batched-bench \
 | 	llama-batched-bench \
 | ||||||
| 	llama-bench \
 | 	llama-bench \
 | ||||||
| 	llama-benchmark-matmult \
 | 	llama-benchmark-matmult \
 | ||||||
|  | 	llama-cli \
 | ||||||
| 	llama-convert-llama2c-to-ggml \
 | 	llama-convert-llama2c-to-ggml \
 | ||||||
| 	llama-embedding \
 | 	llama-embedding \
 | ||||||
| 	llama-eval-callback \
 | 	llama-eval-callback \
 | ||||||
|  | @ -17,7 +17,7 @@ BUILD_TARGETS = \ | ||||||
| 	llama-gritlm \
 | 	llama-gritlm \
 | ||||||
| 	llama-imatrix \
 | 	llama-imatrix \
 | ||||||
| 	llama-infill \
 | 	llama-infill \
 | ||||||
| 	llama-llava \
 | 	llama-llava-cli \
 | ||||||
| 	llama-lookahead \
 | 	llama-lookahead \
 | ||||||
| 	llama-lookup \
 | 	llama-lookup \
 | ||||||
| 	llama-lookup-create \
 | 	llama-lookup-create \
 | ||||||
|  | @ -828,11 +828,11 @@ clean: | ||||||
| # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 | # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 | ||||||
| GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) | GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) | ||||||
| 
 | 
 | ||||||
| llama: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) | llama-cli: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) | ||||||
| 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) | 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) | ||||||
| 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) | 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) | ||||||
| 	@echo | 	@echo | ||||||
| 	@echo '====  Run ./llama -h for help.  ====' | 	@echo '====  Run ./llama-cli -h for help.  ====' | ||||||
| 	@echo | 	@echo | ||||||
| 
 | 
 | ||||||
| llama-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) | llama-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) | ||||||
|  | @ -923,7 +923,7 @@ llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) | ||||||
| libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) | libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) | ||||||
| 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual | 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual | ||||||
| 
 | 
 | ||||||
| llama-llava: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) | llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) | ||||||
| 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) | 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) | ||||||
| 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual | 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual | ||||||
| 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) | 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) | ||||||
|  |  | ||||||
|  | @ -77,7 +77,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, | ||||||
| *Notes:* | *Notes:* | ||||||
| 
 | 
 | ||||||
| - **Memory** | - **Memory** | ||||||
|   - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama`. |   - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`. | ||||||
| 
 | 
 | ||||||
|   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. |   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. | ||||||
| 
 | 
 | ||||||
|  | @ -313,7 +313,7 @@ Examples: | ||||||
| - Use device 0: | - Use device 0: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 | ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 | ||||||
| ``` | ``` | ||||||
| or run by script: | or run by script: | ||||||
| 
 | 
 | ||||||
|  | @ -324,7 +324,7 @@ or run by script: | ||||||
| - Use multiple devices: | - Use multiple devices: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer | ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| Otherwise, you can run the script: | Otherwise, you can run the script: | ||||||
|  |  | ||||||
							
								
								
									
										14
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										14
									
								
								README.md
									
										
									
									
									
								
							|  | @ -218,7 +218,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: | ||||||
| Here is a typical run using LLaMA v2 13B on M2 Ultra: | Here is a typical run using LLaMA v2 13B on M2 Ultra: | ||||||
| 
 | 
 | ||||||
| ``` | ``` | ||||||
| $ make -j && ./llama -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e | $ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e | ||||||
| I llama.cpp build info: | I llama.cpp build info: | ||||||
| I UNAME_S:  Darwin | I UNAME_S:  Darwin | ||||||
| I UNAME_P:  arm | I UNAME_P:  arm | ||||||
|  | @ -585,7 +585,7 @@ Building the program with BLAS support may lead to some performance improvements | ||||||
|   cmake -B build -DLLAMA_VULKAN=1 |   cmake -B build -DLLAMA_VULKAN=1 | ||||||
|   cmake --build build --config Release |   cmake --build build --config Release | ||||||
|   # Test the output binary (with "-ngl 33" to offload all layers to GPU) |   # Test the output binary (with "-ngl 33" to offload all layers to GPU) | ||||||
|   ./bin/llama -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 |   ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 | ||||||
| 
 | 
 | ||||||
|   # You should see in the output, ggml_vulkan detected your GPU. For example: |   # You should see in the output, ggml_vulkan detected your GPU. For example: | ||||||
|   # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 |   # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 | ||||||
|  | @ -632,7 +632,7 @@ python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| # start inference on a gguf model | # start inference on a gguf model | ||||||
| ./llama -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 | ./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| When running the larger models, make sure you have enough disk space to store all the intermediate files. | When running the larger models, make sure you have enough disk space to store all the intermediate files. | ||||||
|  | @ -731,7 +731,7 @@ Here is an example of a few-shot interaction, invoked with the command | ||||||
| ./examples/chat-13B.sh | ./examples/chat-13B.sh | ||||||
| 
 | 
 | ||||||
| # custom arguments using a 13B model | # custom arguments using a 13B model | ||||||
| ./llama -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt | ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. | Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program. | ||||||
|  | @ -740,7 +740,7 @@ Note the use of `--color` to distinguish between user input and generated text. | ||||||
| 
 | 
 | ||||||
| ### Persistent Interaction | ### Persistent Interaction | ||||||
| 
 | 
 | ||||||
| The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. | The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| # Start a new chat | # Start a new chat | ||||||
|  | @ -762,7 +762,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ | ||||||
| `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: | `llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./llama -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' | ./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). | The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). | ||||||
|  | @ -869,7 +869,7 @@ $mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/ho | ||||||
| Now, you can start chatting: | Now, you can start chatting: | ||||||
| ``` | ``` | ||||||
| $cd /data/data/com.termux/files/home/bin | $cd /data/data/com.termux/files/home/bin | ||||||
| $./llama -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml | $./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| Here's a demo of an interactive session running on Pixel 5 phone: | Here's a demo of an interactive session running on Pixel 5 phone: | ||||||
|  |  | ||||||
							
								
								
									
										66
									
								
								ci/run.sh
									
										
									
									
									
								
							
							
						
						
									
										66
									
								
								ci/run.sh
									
										
									
									
									
								
							|  | @ -314,17 +314,17 @@ function gg_run_open_llama_7b_v2 { | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k |     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k |     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log |     (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log |     (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log | ||||||
|     (time ./bin/llama --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log |     (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log | ||||||
|     (time ./bin/llama --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log |     (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log | ||||||
|     (time ./bin/llama --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log |     (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log | ||||||
|     (time ./bin/llama --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log |     (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log | ||||||
|     (time ./bin/llama --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log |     (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log | ||||||
|     (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log |     (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log | ||||||
|     (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log |     (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|  | @ -448,17 +448,17 @@ function gg_run_pythia_1_4b { | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k |     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k |     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-cli --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-cli --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log |     (time ./bin/llama-cli --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log |     (time ./bin/llama-cli --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log | ||||||
|     (time ./bin/llama --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log |     (time ./bin/llama-cli --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log | ||||||
|     (time ./bin/llama --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log |     (time ./bin/llama-cli --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log | ||||||
|     (time ./bin/llama --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log |     (time ./bin/llama-cli --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log | ||||||
|     (time ./bin/llama --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log |     (time ./bin/llama-cli --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log | ||||||
|     (time ./bin/llama --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log |     (time ./bin/llama-cli --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log | ||||||
|     (time ./bin/llama --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log |     (time ./bin/llama-cli --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log | ||||||
|     (time ./bin/llama --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log |     (time ./bin/llama-cli --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|  | @ -580,17 +580,17 @@ function gg_run_pythia_2_8b { | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k |     ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k | ||||||
|     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k |     ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log |     (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log | ||||||
|     (time ./bin/llama --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log |     (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log | ||||||
|     (time ./bin/llama --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log |     (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log | ||||||
|     (time ./bin/llama --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log |     (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log | ||||||
|     (time ./bin/llama --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log |     (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log | ||||||
|     (time ./bin/llama --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log |     (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log | ||||||
|     (time ./bin/llama --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log |     (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log | ||||||
|     (time ./bin/llama --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log |     (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log | ||||||
|     (time ./bin/llama --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log |     (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log | ||||||
| 
 | 
 | ||||||
|     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log |     (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log | ||||||
|     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log |     (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log | ||||||
|  |  | ||||||
|  | @ -3,7 +3,7 @@ | ||||||
| ## Verifying that the model is running on the GPU with CUDA | ## Verifying that the model is running on the GPU with CUDA | ||||||
| Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: | Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: | ||||||
| ```shell | ```shell | ||||||
| ./llama -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " | ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: | When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines: | ||||||
|  | @ -27,7 +27,7 @@ RAM: 32GB | ||||||
| 
 | 
 | ||||||
| Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) | Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML) | ||||||
| 
 | 
 | ||||||
| Run command: `./llama -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` | Run command: `./llama-cli -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]` | ||||||
| 
 | 
 | ||||||
| Result: | Result: | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then | ||||||
|     GEN_OPTIONS+=(--threads "$N_THREAD") |     GEN_OPTIONS+=(--threads "$N_THREAD") | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
| ./llama "${GEN_OPTIONS[@]}" \ | ./llama-cli "${GEN_OPTIONS[@]}" \ | ||||||
|     --model "$MODEL" \ |     --model "$MODEL" \ | ||||||
|     --in-prefix " " \ |     --in-prefix " " \ | ||||||
|     --in-suffix "${AI_NAME}:" \ |     --in-suffix "${AI_NAME}:" \ | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| cd `dirname $0` | cd `dirname $0` | ||||||
| cd .. | cd .. | ||||||
| 
 | 
 | ||||||
| ./llama -m ./models/alpaca.13b.ggmlv3.q8_0.bin \ | ./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \ | ||||||
|        --color \ |        --color \ | ||||||
|        -f ./prompts/alpaca.txt \ |        -f ./prompts/alpaca.txt \ | ||||||
|        --ctx_size 2048 \ |        --ctx_size 2048 \ | ||||||
|  |  | ||||||
|  | @ -1,4 +1,4 @@ | ||||||
| set(TARGET llama-baby) | set(TARGET llama-baby-llama) | ||||||
| add_executable(${TARGET} baby-llama.cpp) | add_executable(${TARGET} baby-llama.cpp) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
|  |  | ||||||
|  | @ -58,4 +58,4 @@ echo "$2 | ||||||
| model=$1 | model=$1 | ||||||
| 
 | 
 | ||||||
| # generate the most likely continuation until the string "===" is found | # generate the most likely continuation until the string "===" is found | ||||||
| ./llama -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs | ./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs | ||||||
|  |  | ||||||
|  | @ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ | ||||||
|      $PROMPT_TEMPLATE > $PROMPT_FILE |      $PROMPT_TEMPLATE > $PROMPT_FILE | ||||||
| 
 | 
 | ||||||
| # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS | # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS | ||||||
| ./llama $GEN_OPTIONS \ | ./llama-cli $GEN_OPTIONS \ | ||||||
|   --model "$MODEL" \ |   --model "$MODEL" \ | ||||||
|   --threads "$N_THREAD" \ |   --threads "$N_THREAD" \ | ||||||
|   --n_predict "$N_PREDICTS" \ |   --n_predict "$N_PREDICTS" \ | ||||||
|  |  | ||||||
|  | @ -62,7 +62,7 @@ fi | ||||||
| if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then | if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then | ||||||
|     echo 'Prompt cache does not exist, building...' |     echo 'Prompt cache does not exist, building...' | ||||||
|     # Default batch_size to 64 here for better user feedback during initial prompt processing |     # Default batch_size to 64 here for better user feedback during initial prompt processing | ||||||
|     ./llama 2>>"$LOG" \ |     ./llama-cli 2>>"$LOG" \ | ||||||
|         --batch_size 64 \ |         --batch_size 64 \ | ||||||
|         "${OPTS[@]}" \ |         "${OPTS[@]}" \ | ||||||
|         --prompt-cache "$PROMPT_CACHE_FILE" \ |         --prompt-cache "$PROMPT_CACHE_FILE" \ | ||||||
|  | @ -109,13 +109,13 @@ while read -e line; do | ||||||
| 
 | 
 | ||||||
|     printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" |     printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE" | ||||||
| 
 | 
 | ||||||
|     ./llama 2>>"$LOG" "${OPTS[@]}" \ |     ./llama-cli 2>>"$LOG" "${OPTS[@]}" \ | ||||||
|             --prompt-cache "$CUR_PROMPT_CACHE" \ |             --prompt-cache "$CUR_PROMPT_CACHE" \ | ||||||
|             --prompt-cache-all \ |             --prompt-cache-all \ | ||||||
|             --file "$CUR_PROMPT_FILE" \ |             --file "$CUR_PROMPT_FILE" \ | ||||||
|             --reverse-prompt "${USER_NAME}:" \ |             --reverse-prompt "${USER_NAME}:" \ | ||||||
|             --n_predict "$n_predict" | |             --n_predict "$n_predict" | | ||||||
|         skip_bytes 1 |                  # skip BOS token added by ./llama |         skip_bytes 1 |                  # skip BOS token added by ./llama-cli | ||||||
|         tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file |         tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file | ||||||
|         skip_bytes "$n_prompt_len_pre"  # print generation |         skip_bytes "$n_prompt_len_pre"  # print generation | ||||||
| 
 | 
 | ||||||
|  | @ -133,7 +133,7 @@ while read -e line; do | ||||||
|     # TODO get both messages in one go |     # TODO get both messages in one go | ||||||
|     if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || |     if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || | ||||||
|         ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then |         ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then | ||||||
|         echo >&2 "Couldn't get number of tokens from ./llama output!" |         echo >&2 "Couldn't get number of tokens from ./llama-cli output!" | ||||||
|         exit 1 |         exit 1 | ||||||
|     fi |     fi | ||||||
| 
 | 
 | ||||||
|  | @ -144,7 +144,7 @@ while read -e line; do | ||||||
|     fi |     fi | ||||||
| 
 | 
 | ||||||
|     # Update cache for next prompt in background, ideally during user input |     # Update cache for next prompt in background, ideally during user input | ||||||
|     ./llama >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ |     ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \ | ||||||
|           --prompt-cache "$NEXT_PROMPT_CACHE" \ |           --prompt-cache "$NEXT_PROMPT_CACHE" \ | ||||||
|           --file "$NEXT_PROMPT_FILE" \ |           --file "$NEXT_PROMPT_FILE" \ | ||||||
|           --n_predict 1 & |           --n_predict 1 & | ||||||
|  |  | ||||||
|  | @ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ | ||||||
|      $PROMPT_TEMPLATE > $PROMPT_FILE |      $PROMPT_TEMPLATE > $PROMPT_FILE | ||||||
| 
 | 
 | ||||||
| # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS | # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS | ||||||
| ./bin/llama $GEN_OPTIONS \ | ./bin/llama-cli $GEN_OPTIONS \ | ||||||
|   --model "$MODEL" \ |   --model "$MODEL" \ | ||||||
|   --threads "$N_THREAD" \ |   --threads "$N_THREAD" \ | ||||||
|   --n_predict "$N_PREDICTS" \ |   --n_predict "$N_PREDICTS" \ | ||||||
|  |  | ||||||
|  | @ -11,6 +11,6 @@ cd .. | ||||||
| # | # | ||||||
| #   "--keep 48" is based on the contents of prompts/chat-with-bob.txt | #   "--keep 48" is based on the contents of prompts/chat-with-bob.txt | ||||||
| # | # | ||||||
| ./llama -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ | ./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \ | ||||||
|     --repeat_penalty 1.0 --color -i \ |     --repeat_penalty 1.0 --color -i \ | ||||||
|     -r "User:" -f prompts/chat-with-bob.txt |     -r "User:" -f prompts/chat-with-bob.txt | ||||||
|  |  | ||||||
|  | @ -25,4 +25,4 @@ Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.b | ||||||
| 
 | 
 | ||||||
| Now you can use the model with a command like: | Now you can use the model with a command like: | ||||||
| 
 | 
 | ||||||
| `$ ./llama -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` | `$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s | ||||||
|         --use-checkpointing |         --use-checkpointing | ||||||
| 
 | 
 | ||||||
| # predict | # predict | ||||||
| ./bin/llama -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin | ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). | **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). | ||||||
|  | @ -45,7 +45,7 @@ In `main` you can also load multiple LORA adapters, which will then be mixed tog | ||||||
| For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: | For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./bin/llama -m open-llama-3b-v2-q8_0.gguf \ | ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ | ||||||
|   --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ |   --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ | ||||||
|   --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin |   --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin | ||||||
| ``` | ``` | ||||||
|  | @ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin | ||||||
| For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: | For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./bin/llama -m open-llama-3b-v2-q8_0.gguf \ | ./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ | ||||||
|   --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ |   --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ | ||||||
|   --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ |   --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ | ||||||
|   --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin |   --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin | ||||||
|  |  | ||||||
|  | @ -19,7 +19,7 @@ fi | ||||||
| set -x | set -x | ||||||
| 
 | 
 | ||||||
| SPLIT=$1/gguf-split | SPLIT=$1/gguf-split | ||||||
| MAIN=$1/llama | MAIN=$1/llama-cli | ||||||
| WORK_PATH=$TMP_DIR/gguf-split | WORK_PATH=$TMP_DIR/gguf-split | ||||||
| ROOT_DIR=$(realpath $(dirname $0)/../../) | ROOT_DIR=$(realpath $(dirname $0)/../../) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| cd `dirname $0` | cd `dirname $0` | ||||||
| cd .. | cd .. | ||||||
| 
 | 
 | ||||||
| ./llama --color --instruct --threads 4 \ | ./llama-cli --color --instruct --threads 4 \ | ||||||
|        --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ |        --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ | ||||||
|        --file ./prompts/alpaca.txt \ |        --file ./prompts/alpaca.txt \ | ||||||
|        --batch_size 8 --ctx_size 2048 -n -1 \ |        --batch_size 8 --ctx_size 2048 -n -1 \ | ||||||
|  |  | ||||||
|  | @ -21,7 +21,7 @@ counter=1 | ||||||
| echo 'Running' | echo 'Running' | ||||||
| while IFS= read -r question | while IFS= read -r question | ||||||
| do | do | ||||||
|   exe_cmd="./llama -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" |   exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\"" | ||||||
|   echo $counter |   echo $counter | ||||||
|   echo "Current Question: $question" |   echo "Current Question: $question" | ||||||
|   eval "$exe_cmd" |   eval "$exe_cmd" | ||||||
|  |  | ||||||
|  | @ -524,7 +524,7 @@ class SchemaConverter: | ||||||
| def main(args_in = None): | def main(args_in = None): | ||||||
|     parser = argparse.ArgumentParser( |     parser = argparse.ArgumentParser( | ||||||
|         description=''' |         description=''' | ||||||
|             Generates a grammar (suitable for use in ./llama) that produces JSON conforming to a |             Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a | ||||||
|             given JSON schema. Only a subset of JSON schema features are supported; more may be |             given JSON schema. Only a subset of JSON schema features are supported; more may be | ||||||
|             added in the future. |             added in the future. | ||||||
|         ''', |         ''', | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| cd `dirname $0` | cd `dirname $0` | ||||||
| cd .. | cd .. | ||||||
| 
 | 
 | ||||||
| ./llama -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \ | ./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \ | ||||||
|        --color \ |        --color \ | ||||||
|        --ctx_size 2048 \ |        --ctx_size 2048 \ | ||||||
|        -n -1 \ |        -n -1 \ | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| cd `dirname $0` | cd `dirname $0` | ||||||
| cd .. | cd .. | ||||||
| 
 | 
 | ||||||
| ./llama -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \ | ./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \ | ||||||
|        --color \ |        --color \ | ||||||
|        --ctx_size 2048 \ |        --ctx_size 2048 \ | ||||||
|        -n -1 \ |        -n -1 \ | ||||||
|  |  | ||||||
|  | @ -30,9 +30,9 @@ if(TARGET BUILD_INFO) | ||||||
|     add_dependencies(llava BUILD_INFO) |     add_dependencies(llava BUILD_INFO) | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| set(TARGET llama-llava) | set(TARGET llama-llava-cli) | ||||||
| add_executable(${TARGET} llava-cli.cpp) | add_executable(${TARGET} llava-cli.cpp) | ||||||
| set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava) | set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_11) | ||||||
|  |  | ||||||
|  | @ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM | ||||||
| Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. | Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. | ||||||
| 
 | 
 | ||||||
| ## Usage | ## Usage | ||||||
| Build with cmake or run `make llama-llava` to build it. | Build with cmake or run `make llama-llava-cli` to build it. | ||||||
| 
 | 
 | ||||||
| After building, run: `./llama-llava` to see the usage. For example: | After building, run: `./llama-llava-cli` to see the usage. For example: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ./llama-llava -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ | ./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ |     --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ | ||||||
|     --image path/to/an/image.jpg \ |     --image path/to/an/image.jpg \ | ||||||
|     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" |     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" | ||||||
|  | @ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path` | ||||||
| ### case 1 | ### case 1 | ||||||
| **input** | **input** | ||||||
| ```sh | ```sh | ||||||
| /data/local/tmp/llama-llava \ | /data/local/tmp/llama-llava-cli \ | ||||||
|     -m /data/local/tmp/ggml-model-q4_k.gguf \ |     -m /data/local/tmp/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ |     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ | ||||||
|     -t 4 \ |     -t 4 \ | ||||||
|  | @ -102,7 +102,7 @@ llama_print_timings:       total time =   34731.93 ms | ||||||
| ### case 2 | ### case 2 | ||||||
| **input** | **input** | ||||||
| ```sh | ```sh | ||||||
| /data/local/tmp/llama-llava \ | /data/local/tmp/llama-llava-cli \ | ||||||
|     -m /data/local/tmp/ggml-model-q4_k.gguf \ |     -m /data/local/tmp/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ |     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ | ||||||
|     -t 4 \ |     -t 4 \ | ||||||
|  | @ -126,7 +126,7 @@ llama_print_timings:       total time =   34570.79 ms | ||||||
| #### llava-cli release-b2005 | #### llava-cli release-b2005 | ||||||
| **input** | **input** | ||||||
| ```sh | ```sh | ||||||
| /data/local/tmp/llama-llava \ | /data/local/tmp/llama-llava-cli \ | ||||||
|     -m /data/local/tmp/ggml-model-q4_k.gguf \ |     -m /data/local/tmp/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ |     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ | ||||||
|     -t 4 \ |     -t 4 \ | ||||||
|  | @ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 | ||||||
| ### case 1 | ### case 1 | ||||||
| **input** | **input** | ||||||
| ```sh | ```sh | ||||||
| ./llama-llava \ | ./llama-llava-cli \ | ||||||
|     -m /data/local/tmp/ggml-model-q4_k.gguf \ |     -m /data/local/tmp/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ |     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ | ||||||
|     --image /data/local/tmp/demo.jpeg \ |     --image /data/local/tmp/demo.jpeg \ | ||||||
|  | @ -224,7 +224,7 @@ llama_print_timings:       total time =    1352.63 ms /   252 tokens | ||||||
| ### case 2 | ### case 2 | ||||||
| **input** | **input** | ||||||
| ```sh | ```sh | ||||||
| ./llama-llava \ | ./llama-llava-cli \ | ||||||
|     -m /data/local/tmp/ggml-model-q4_k.gguf \ |     -m /data/local/tmp/ggml-model-q4_k.gguf \ | ||||||
|     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ |     --mmproj /data/local/tmp/mmproj-model-f16.gguf \ | ||||||
|     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \ |     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \ | ||||||
|  |  | ||||||
|  | @ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h | ||||||
| After API is confirmed, more models will be supported / uploaded. | After API is confirmed, more models will be supported / uploaded. | ||||||
| 
 | 
 | ||||||
| ## Usage | ## Usage | ||||||
| Build with cmake or run `make llama-llava` to build it. | Build with cmake or run `make llama-llava-cli` to build it. | ||||||
| 
 | 
 | ||||||
| After building, run: `./llama-llava` to see the usage. For example: | After building, run: `./llama-llava-cli` to see the usage. For example: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ./llama-llava -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg | ./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| **note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so. | **note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so. | ||||||
|  | @ -97,7 +97,7 @@ python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow | ||||||
| 
 | 
 | ||||||
| 7) And finally we can run the llava cli using the 1.6 model version: | 7) And finally we can run the llava cli using the 1.6 model version: | ||||||
| ```console | ```console | ||||||
| ./llama-llava -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 | ./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) | **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) | ||||||
|  |  | ||||||
|  | @ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant. | ||||||
| # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" | # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" | ||||||
| 
 | 
 | ||||||
| program_dir="build_64/bin" | program_dir="build_64/bin" | ||||||
| binName="llama-llava" | binName="llama-llava-cli" | ||||||
| n_threads=4 | n_threads=4 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,5 @@ | ||||||
| set(TARGET llama-cli) | set(TARGET llama-cli) | ||||||
| add_executable(${TARGET} main.cpp) | add_executable(${TARGET} main.cpp) | ||||||
| set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama) |  | ||||||
| install(TARGETS ${TARGET} RUNTIME) | install(TARGETS ${TARGET} RUNTIME) | ||||||
| target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) | ||||||
| target_compile_features(${TARGET} PRIVATE cxx_std_11) | target_compile_features(${TARGET} PRIVATE cxx_std_11) | ||||||
|  |  | ||||||
|  | @ -20,7 +20,7 @@ To get started right away, run the following command, making sure to use the cor | ||||||
| #### Unix-based systems (Linux, macOS, etc.): | #### Unix-based systems (Linux, macOS, etc.): | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./llama -m models/7B/ggml-model.bin --prompt "Once upon a time" | ./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time" | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| #### Windows: | #### Windows: | ||||||
|  | @ -34,7 +34,7 @@ For an interactive experience, try this command: | ||||||
| #### Unix-based systems (Linux, macOS, etc.): | #### Unix-based systems (Linux, macOS, etc.): | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./llama -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ | ./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ | ||||||
| 'User: Hi | 'User: Hi | ||||||
| AI: Hello. I am an AI chatbot. Would you like to talk? | AI: Hello. I am an AI chatbot. Would you like to talk? | ||||||
| User: Sure! | User: Sure! | ||||||
|  | @ -53,7 +53,7 @@ The following command generates "infinite" text from a starting prompt (you can | ||||||
| #### Unix-based systems (Linux, macOS, etc.): | #### Unix-based systems (Linux, macOS, etc.): | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| ./llama -m models/7B/ggml-model.bin --ignore-eos -n -1 | ./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1 | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| #### Windows: | #### Windows: | ||||||
|  | @ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o | ||||||
| The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: | The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ./llama -r "User:" --in-prefix " " | ./llama-cli -r "User:" --in-prefix " " | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ### In-Suffix | ### In-Suffix | ||||||
|  | @ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is | ||||||
| The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: | The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: | ||||||
| 
 | 
 | ||||||
| ```sh | ```sh | ||||||
| ./llama -r "User:" --in-prefix " " --in-suffix "Assistant:" | ./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:" | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ## Context Management | ## Context Management | ||||||
|  |  | ||||||
|  | @ -20,7 +20,7 @@ set -x | ||||||
| 
 | 
 | ||||||
| SPLIT=$1/gguf-split | SPLIT=$1/gguf-split | ||||||
| QUANTIZE=$1/quantize | QUANTIZE=$1/quantize | ||||||
| MAIN=$1/llama | MAIN=$1/llama-cli | ||||||
| WORK_PATH=$TMP_DIR/quantize | WORK_PATH=$TMP_DIR/quantize | ||||||
| ROOT_DIR=$(realpath $(dirname $0)/../../) | ROOT_DIR=$(realpath $(dirname $0)/../../) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then | ||||||
|   MODEL="-m $2 " |   MODEL="-m $2 " | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
| ./llama $MODEL --color \ | ./llama-cli $MODEL --color \ | ||||||
|     -f ./prompts/reason-act.txt \ |     -f ./prompts/reason-act.txt \ | ||||||
|     -i --interactive-first \ |     -i --interactive-first \ | ||||||
|     --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \ |     --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \ | ||||||
|  |  | ||||||
|  | @ -70,5 +70,5 @@ cmake --build . --config Release | ||||||
| Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: | Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| $ bin/llama -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 | $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
|  | @ -23,15 +23,15 @@ fi | ||||||
| if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then | if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then | ||||||
|     echo "use $GGML_SYCL_DEVICE as main GPU" |     echo "use $GGML_SYCL_DEVICE as main GPU" | ||||||
|     #use single GPU only |     #use single GPU only | ||||||
|     ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none |     ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none | ||||||
| else | else | ||||||
|     #use multiple GPUs with same max compute units |     #use multiple GPUs with same max compute units | ||||||
|     ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 |     ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
| #use main GPU only | #use main GPU only | ||||||
| #ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none | #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none | ||||||
| 
 | 
 | ||||||
| #use multiple GPUs with same max compute units | #use multiple GPUs with same max compute units | ||||||
| #ZES_ENABLE_SYSMAN=1 ./build/bin/llama -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 | #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s | ||||||
|         --no-checkpointing |         --no-checkpointing | ||||||
| 
 | 
 | ||||||
| # predict | # predict | ||||||
| ./bin/llama -m ggml-shakespeare-256x16-f32.gguf | ./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| Output files will be saved every N iterations (config with `--save-every N`). | Output files will be saved every N iterations (config with `--save-every N`). | ||||||
|  |  | ||||||
|  | @ -91,7 +91,7 @@ item ::= [^\n]+ "\n" | ||||||
| 
 | 
 | ||||||
| This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with: | This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with: | ||||||
| ``` | ``` | ||||||
| ./llama -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' | ./llama-cli -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| ## Troubleshooting | ## Troubleshooting | ||||||
|  |  | ||||||
|  | @ -3,9 +3,9 @@ | ||||||
| # Shortcut for downloading HF models | # Shortcut for downloading HF models | ||||||
| # | # | ||||||
| # Usage: | # Usage: | ||||||
| #   ./llama -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) | #   ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) | ||||||
| #   ./llama -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) | #   ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) | ||||||
| #   ./llama -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) | #   ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) | ||||||
| # | # | ||||||
| 
 | 
 | ||||||
| # all logs go to stderr | # all logs go to stderr | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue