Sukriti Sharma 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								784a14aa49 
								
							 
						 
						
							
							
								
								convert : add support for Roberta embeddings ( #10695 )  
							
							
							
						 
						
							2024-12-07 09:02:14 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c5ede3849f 
								
							 
						 
						
							
							
								
								convert : add custom attention mapping  
							
							
							
						 
						
							2024-12-06 21:33:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Xuan Son Nguyen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								f162d45a21 
								
							 
						 
						
							
							
								
								common : bring back --no-warmup to server ( #10686 )  
							
							
							
						 
						
							2024-12-06 13:29:05 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Xuan Son Nguyen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6c5bc0625f 
								
							 
						 
						
							
							
								
								server : (refactoring) do not rely on JSON internally ( #10643 )  
							
							... 
							
							
							
							* server : (refactoring) reduce usage of json internally
* move all response types to struct
* wip [no ci]
* many fixes
* add virtual function
* fix index
* minor style fix
* add std::move
* refactor handle_completions_generic
* add virtual functions
* remove server.hpp
* clarify server_sent_event RFC specs
* apply review comments
* fix model_alias and completion_probabilities
* small clean up
* remove virtual for to_json_oai_compat()
* naming oai_compat --> oaicompat
* fix unwanted recursive call
* update docs 
							
						 
						
							2024-12-06 11:14:32 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Plamen Minev 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								7736837d62 
								
							 
						 
						
							
							
								
								fix(server) : not show alert when DONE is received ( #10674 )  
							
							
							
						 
						
							2024-12-05 22:36:41 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Jeff Bolz 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c9c6e01dae 
								
							 
						 
						
							
							
								
								vulkan: Add VK_NV_cooperative_matrix2 support for mul_mat and flash attention ( #10206 )  
							
							
							
						 
						
							2024-12-05 20:15:05 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Riccardo Orlando 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6fe6247831 
								
							 
						 
						
							
							
								
								llama : add Minerva 7B model support ( #10673 )  
							
							... 
							
							
							
							* Support for Minerva 7B
* Update convert_hf_to_gguf_update.py 
							
						 
						
							2024-12-05 20:30:59 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0cd182ebcc 
								
							 
						 
						
							
							
								
								sync : ggml  
							
							
							
						 
						
							2024-12-05 13:27:42 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									PAB 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								a8cbab201d 
								
							 
						 
						
							
							
								
								ggml: add GGML_SET Metal kernel + i32 CPU kernel (ggml/1037)  
							
							... 
							
							
							
							* implemented cpu kernel
* add i32 test cases in test-backend-ops
* typedef `ggml_metal_kargs_set`
* implemented `kernel_set`
* memcpy 
							
						 
						
							2024-12-05 13:27:33 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									PAB 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								c2082d93a8 
								
							 
						 
						
							
							
								
								ggml : add GGML_PAD_REFLECT_1D operation (ggml/1034)  
							
							... 
							
							
							
							* ggml_pad_reflect_1d defined in header
* implemented on CPU
* called the forward pass
* impl Metal kernel
* added Metal kernel
* added OP_PAD_REFLECT_1D in test-backend-ops.cpp
* add test-pad-reflect-1d test case
* test case support multiple backend 
							
						 
						
							2024-12-05 13:27:31 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Daniel Bevenius 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								d405804be8 
								
							 
						 
						
							
							
								
								py : update outdated copy-paste instructions [no ci] ( #10667 )  
							
							... 
							
							
							
							This commit updates the copy-paste instruction in
convert_hf_to_gguf_update.py to reflect that convert_hf_to_gguf.py
will have already been updated with the new get_vocab_base_pre()
function when this script completes. 
							
						 
						
							2024-12-05 09:47:55 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									aryantandon01 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								f112d198cd 
								
							 
						 
						
							
							
								
								Update deprecation-warning.cpp ( #10619 )  
							
							... 
							
							
							
							Fixed Path Separator Handling for Cross-Platform Support (Windows File Systems) 
							
						 
						
							2024-12-04 23:19:20 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								1da7b76569 
								
							 
						 
						
							
							
								
								server : fix speculative decoding with context shift ( #10641 )  
							
							... 
							
							
							
							* server : fix speculative decoding with context shift
ggml-ci
* server : take into account speculative limits
ggml-ci
* server : add tests 
							
						 
						
							2024-12-04 22:38:20 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Diego Devesa 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								59f4db1088 
								
							 
						 
						
							
							
								
								ggml : add predefined list of CPU backend variants to build ( #10626 )  
							
							... 
							
							
							
							* ggml : add predefined list of CPU backend variants to build
* update CPU dockerfiles 
							
						 
						
							2024-12-04 14:45:40 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Diego Devesa 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2803540814 
								
							 
						 
						
							
							
								
								ggml-cpu : fix HWCAP2_I8MM value ( #10646 )  
							
							
							
						 
						
							2024-12-04 14:40:44 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									ltoniazzi 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								253b7fde91 
								
							 
						 
						
							
							
								
								Fix HF repo commit to clone lora test models ( #10649 )  
							
							
							
						 
						
							2024-12-04 10:45:48 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									JFLFY2255 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8d0cfd554a 
								
							 
						 
						
							
							
								
								llama: Support MiniCPM-1B (with & w/o longrope) ( #10559 )  
							
							
							
						 
						
							2024-12-04 11:42:50 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Jeff Bolz 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								2759916d86 
								
							 
						 
						
							
							
								
								vulkan: Implement "fast divide" (mul+shift) for unary ops like copy ( #10642 )  
							
							
							
						 
						
							2024-12-04 08:28:59 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Nicolò Scipione 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								40c6d79fb5 
								
							 
						 
						
							
							
								
								SYCL : Move to compile time oneMKL interface backend selection for NVIDIA backend ( #10584 )  
							
							... 
							
							
							
							* [SYCL] Move to Compile Time backend selection on oneMKL Interface for NVIDIA backend
Move to compile time selection to backend to avoid latency at run time.
Add it to all mkl gemm calls and only for NVIDIA backend.
Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
* Formatting
* Address PR comments to increase readability
---------
Signed-off-by: nscipione <nicolo.scipione@codeplay.com> 
							
						 
						
							2024-12-04 09:29:20 +08:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Wang Ran (汪然) 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								98036d5670 
								
							 
						 
						
							
							
								
								fix typo of README.md ( #10605 )  
							
							
							
						 
						
							2024-12-04 02:22:50 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Frankie Robertson 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cd2f37b304 
								
							 
						 
						
							
							
								
								Avoid using __fp16 on ARM with old nvcc ( #10616 )  
							
							
							
						 
						
							2024-12-04 01:41:37 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Benson Wong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								da6aac91f1 
								
							 
						 
						
							
							
								
								Add docs for creating a static build ( #10268 ) ( #10630 )  
							
							... 
							
							
							
							* Add notes for a static build
* Update docs/build.md
---------
Co-authored-by: Diego Devesa <slarengh@gmail.com> 
							
						 
						
							2024-12-04 01:40:36 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									piDack 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								01e6d9bb71 
								
							 
						 
						
							
							
								
								clip : add sycl support ( #10574 )  
							
							... 
							
							
							
							Co-authored-by: piDack <pcdack@hotmail.co> 
							
						 
						
							2024-12-04 01:26:37 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Jeff Bolz 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								cc98896db8 
								
							 
						 
						
							
							
								
								vulkan: optimize and reenable split_k ( #10637 )  
							
							... 
							
							
							
							Use vector loads when possible in mul_mat_split_k_reduce. Use split_k
when there aren't enough workgroups to fill the shaders. 
							
						 
						
							2024-12-03 20:29:54 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Xuan Son Nguyen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								91c36c269b 
								
							 
						 
						
							
							
								
								server : (web ui) Various improvements, now use vite as bundler ( #10599 )  
							
							... 
							
							
							
							* hide buttons in dropdown menu
* use npm as deps manager and vite as bundler
* fix build
* fix build (2)
* fix responsive on mobile
* fix more problems on mobile
* sync build
* (test) add CI step for verifying build
* fix ci
* force rebuild .hpp files
* cmake: clean up generated files pre build 
							
						 
						
							2024-12-03 19:38:44 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
							
							
								
							
							
								1cd3df46bd 
								
							 
						 
						
							
							
								
								scripts : remove amx sync  
							
							... 
							
							
							
							ggml-ci 
							
						 
						
							2024-12-03 20:04:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
							
							
								
							
							
								c505471857 
								
							 
						 
						
							
							
								
								sync : ggml  
							
							
							
						 
						
							2024-12-03 20:04:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									mahorozte 
								
							 
						 
						
							
							
							
							
								
							
							
								e9e661bd59 
								
							 
						 
						
							
							
								
								CUDA: remove unnecessary warp reduce in FA (ggml/1032)  
							
							... 
							
							
							
							* kqmax_new_j in every thread within the warp is the same after the operation at line 199, so this reduce can be omitted
* same problem in vec32
---------
Co-authored-by: ZhaoXiaoYu <zhao.xiaoyu@zte.com.cn> 
							
						 
						
							2024-12-03 20:04:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									PAB 
								
							 
						 
						
							
							
							
							
								
							
							
								efb6ae9630 
								
							 
						 
						
							
							
								
								feat: add GGML_UNARY_OP_ARGMAX Metal kernel (ggml/1019)  
							
							... 
							
							
							
							* implemented argmax kernel
* tpig -> tgpig
* change to strides
* contiguous assertions
* kernel working and tested
* argmax simd parallel implementation
* added 2 new tests for argmax in test-backend-ops
* cosmit
* added 3 tests cases for perf eval
* add test_argmax in make_test_cases_perf
* Update test-backend-ops.cpp
Co-authored-by: Diego Devesa <slarengh@gmail.com>
---------
Co-authored-by: Diego Devesa <slarengh@gmail.com> 
							
						 
						
							2024-12-03 20:04:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									PAB 
								
							 
						 
						
							
							
							
							
								
							
							
								667d70d170 
								
							 
						 
						
							
							
								
								metal : add GGML_OP_CONV_TRANSPOSE_1D kernels (ggml/1026)  
							
							... 
							
							
							
							* wip
* wip implementation f32
* kernel conv transpose 1d f32 working
* initial commit 
							
						 
						
							2024-12-03 20:04:49 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Xuan Son Nguyen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3b4f2e33e2 
								
							 
						 
						
							
							
								
								llama : add missing LLAMA_API for llama_chat_builtin_templates ( #10636 )  
							
							
							
						 
						
							2024-12-03 12:54:30 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Nikolaos Pothitos 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								82bca2257b 
								
							 
						 
						
							
							
								
								readme : add option, update default value, fix formatting ( #10271 )  
							
							... 
							
							
							
							* readme : document --no-display-prompt
* readme : update default prompt context size
* readme : remove unnecessary indentation
Indenting a line with four spaces makes Markdown treat that section as
plain text.
* readme : indent commands under bullets
* readme : indent commands in lettered list 
							
						 
						
							2024-12-03 12:50:08 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0115df2f65 
								
							 
						 
						
							
							
								
								metal : small-batch mat-mul kernels ( #10581 )  
							
							... 
							
							
							
							* metal : small-batch mat-mul kernels
ggml-ci
* metal : add rest of types
ggml-ci
* metal : final adjustments
ggml-ci
* metal : add comments
ggml-ci 
							
						 
						
							2024-12-03 11:52:33 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								515d4e5372 
								
							 
						 
						
							
							
								
								github : minify link [no ci] (revert)  
							
							... 
							
							
							
							this doesn't work as expected 
							
						 
						
							2024-12-03 11:21:43 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								844e2e1fee 
								
							 
						 
						
							
							
								
								github : minify link [no ci]  
							
							
							
						 
						
							2024-12-03 11:20:35 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								70b98fadbc 
								
							 
						 
						
							
							
								
								server : fix default draft model parameters ( #10586 )  
							
							... 
							
							
							
							* server : force F16 KV cache for the draft model
ggml-ci
* server : fix draft params
ggml-ci
* server : various params fixes
ggml-ci 
							
						 
						
							2024-12-03 11:20:00 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Xuan Son Nguyen 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								642330ac7c 
								
							 
						 
						
							
							
								
								llama : add enum for built-in chat templates ( #10623 )  
							
							... 
							
							
							
							* llama : add enum for supported chat templates
* use "built-in" instead of "supported"
* arg: print list of built-in templates
* fix test
* update server README 
							
						 
						
							2024-12-02 22:10:19 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								8648c52101 
								
							 
						 
						
							
							
								
								make : deprecate ( #10514 )  
							
							... 
							
							
							
							* make : deprecate
ggml-ci
* ci : disable Makefile builds
ggml-ci
* docs : remove make references [no ci]
* ci : disable swift build
ggml-ci
* docs : remove obsolete make references, scripts, examples
ggml-ci
* basic fix for compare-commits.sh
* update build.md
* more build.md updates
* more build.md updates
* more build.md updates
* Update Makefile
Co-authored-by: Diego Devesa <slarengh@gmail.com>
---------
Co-authored-by: slaren <slarengh@gmail.com> 
							
						 
						
							2024-12-02 21:22:53 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									haopeng 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								64ed2091b2 
								
							 
						 
						
							
							
								
								server: Add "tokens per second" information in the backend ( #10548 )  
							
							... 
							
							
							
							* add cmake rvv support
* add timings
* remove space
* update readme
* fix
* fix code
* remove empty line
* add test
---------
Co-authored-by: Xuan Son Nguyen <son@huggingface.co> 
							
						 
						
							2024-12-02 14:45:54 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Akarshan Biswas 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								991f8aabee 
								
							 
						 
						
							
							
								
								SYCL: Fix and switch to GGML_LOG system instead of fprintf ( #10579 )  
							
							... 
							
							
							
							* Switched to GGML_LOG
* Fix missing semicolon 
							
						 
						
							2024-12-02 15:04:11 +08:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								4cb003dd8d 
								
							 
						 
						
							
							
								
								contrib : refresh ( #10593 )  
							
							... 
							
							
							
							* contrib : refresh
* contrib : expand [no ci]
* contrib : expand test-backend-ops instructions
* contrib : add CODEOWNERS
* prs : update template to not have checkbox [no ci] 
							
						 
						
							2024-12-02 08:53:27 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Juk Armstrong 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								917786f43d 
								
							 
						 
						
							
							
								
								Add mistral-v1, mistral-v3, mistral-v3-tekken and mistral-v7 chat template types ( #10572 )  
							
							... 
							
							
							
							* Templates: `mistral-v1`, `mistral-v2`, `mistral-v3`, `mistral-v3-tekken`
* Changed system message logic and added tests for all 4
* Invalid `system_message` instead of `content` fixed
* Removed tab-indented lines
* Added template code and test for `mistral-v7`
* Added all tests. Fixed bug with `tmpl == "llama2"` test.
* Replaced tabs with spaces.
* Removed `'mistral-v2'` option as no (open) models ever used it
* Removed all references to 'v2' template from comments
* Update llama.cpp
Fixed `trim_assistant_message` bug 
							
						 
						
							2024-12-01 23:09:49 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5e1ed95583 
								
							 
						 
						
							
							
								
								grammars : add English-only grammar ( #10612 )  
							
							
							
						 
						
							2024-12-01 21:37:54 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Wang Qin 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								5c7a5aa0c3 
								
							 
						 
						
							
							
								
								ci: add error handling for Python venv creation in run.sh ( #10608 )  
							
							
							
						 
						
							2024-12-01 20:11:42 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Diego Devesa 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3420909dff 
								
							 
						 
						
							
							
								
								ggml : automatic selection of best CPU backend ( #10606 )  
							
							... 
							
							
							
							* ggml : automatic selection of best CPU backend
* amx : minor opt
* add GGML_AVX_VNNI to enable avx-vnni, fix checks 
							
						 
						
							2024-12-01 16:12:41 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									alek3y 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								86dc11c5bc 
								
							 
						 
						
							
							
								
								server : bind to any port when specified ( #10590 )  
							
							
							
						 
						
							2024-12-01 13:33:12 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								6acce39710 
								
							 
						 
						
							
							
								
								readme : update the usage section with examples ( #10596 )  
							
							... 
							
							
							
							* readme : update the usage section with examples
* readme : more examples 
							
						 
						
							2024-12-01 11:25:17 +02:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Wang Qin 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								43957ef203 
								
							 
						 
						
							
							
								
								build: update Makefile comments for C++ version change ( #10598 )  
							
							
							
						 
						
							2024-12-01 04:19:44 +01:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Adrien Gallouët 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								0c39f44d70 
								
							 
						 
						
							
							
								
								ggml-cpu: replace AArch64 NEON assembly with intrinsics in ggml_gemv_q4_0_4x4_q8_0() ( #10567 )  
							
							... 
							
							
							
							Signed-off-by: Adrien Gallouët <angt@huggingface.co> 
							
						 
						
							2024-11-30 09:13:18 -08:00 
							
								 
							
						 
					 
				
					
						
							
								
								
									Georgi Gerganov 
								
							 
						 
						
							
							
								
								
							
							
							
								
							
							
								3e0ba0e604 
								
							 
						 
						
							
							
								
								readme : remove old badge  
							
							
							
						 
						
							2024-11-30 10:09:21 +02:00