diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index f63860d14..276a217d4 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -17,7 +17,7 @@ jobs: steps: - uses: actions/stale@v5 with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" + exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" diff --git a/AUTHORS b/AUTHORS index 2eb60806a..6796b2941 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# date: Thu Nov 28 20:46:15 EET 2024 +# date: Tue Feb 4 13:04:05 EET 2025 # this file is auto-generated by scripts/gen-authors.sh 0cc4m @@ -20,6 +20,8 @@ Adithya Balaji AdithyanI Adrian Adrian Hesketh +Adrien Gallouët +Adrien Gallouët Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> @@ -55,6 +57,7 @@ Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon Andreas (Andi) Kunar +Andreas Kieslinger <47689530+aendk@users.noreply.github.com> Andrei Andrew Canis Andrew Downing @@ -91,13 +94,17 @@ Ben Siraphob Ben Williams Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> +Benson Wong Bernat Vadell +Bernhard M. Wiedemann Bert Wagner +Billel Mokeddem Bingan <70050083+binganao@users.noreply.github.com> Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov +Borislav Stanimirov Branden Butler Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Brian @@ -117,6 +124,7 @@ Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre +CentricStorm Chad Brewbaker Changyeon Kim Chao Jiang @@ -131,12 +139,15 @@ Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> +Christian Kastner Christian Kögler Christian Köhnenkamp Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> +Christopher Nielsen <62156882+mascguy@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron Conrad Kramer +Corentin REGAL CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> Csaba Kecskemeti Cuong Trinh Manh @@ -176,6 +187,7 @@ Dibakar Gope Didzis Gosko Diego Devesa Diogo Teles Sant'Anna +Djip007 <3705339+Djip007@users.noreply.github.com> Djip007 Don Mahurin DooWoong Lee (David) @@ -193,6 +205,7 @@ Edward Taylor Elaine Elbios <141279586+Elbios@users.noreply.github.com> Elton Kola +Emreerdog <34742675+Emreerdog@users.noreply.github.com> Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim Eric Curtin @@ -233,6 +246,7 @@ Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart Gabe Goodhart +Gaetan Bisson GainLee Galunid Gary Linscott @@ -249,6 +263,7 @@ Guillaume "Vermeille" Sanchez Guillaume Wenzek Guoliang Hua <32868157+nbcsm@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com> +Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Haggai Nuchi Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> @@ -259,11 +274,13 @@ Haoxiang Fei Harald Fernengel Hatsune Miku <129688334+at8u@users.noreply.github.com> HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> +Haus1 
Henk Poley Henri Vasserman Henrik Forstén Herman Semenov Hesen Peng +HimariO Hoang Nguyen Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> @@ -280,6 +297,7 @@ Icecream95 Ido S IgnacioFDM Igor Okulist +Ihar Hrachyshka Ikko Eltociear Ashimine Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Ionoclast Laboratories @@ -289,12 +307,14 @@ Ivan Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Komarov Ivan Stepanov +JFLFY2255 JH23X <165871467+JH23X@users.noreply.github.com> Jack Mousseau Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> Jaeden Amero Jaemin Son +Jafar Uruç Jag Chadha Jakub N James A Capozzoli <157492257+jac-jim@users.noreply.github.com> @@ -315,6 +335,7 @@ Jeffrey Morgan Jeffrey Quesnelle Jeroen Mostert Jesse Jojo Johnson +Jett Janiak Jeximo Jhen-Jie Hong Jiahao Li @@ -343,6 +364,7 @@ Josh Ramer Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd +Juk Armstrong <69222624+jukofyork@users.noreply.github.com> Julius Arkenberg Jun Hee Yoo Jun Jie <71215065+junnjiee16@users.noreply.github.com> @@ -357,6 +379,7 @@ Justine Tunney Juuso Alasuutari KASR Kamil Tomšík +Karol Kontny <82021046+kkontny@users.noreply.github.com> Karsten Weiss Karthick Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> @@ -376,6 +399,7 @@ Kolen Cheung Konstantin Herud Konstantin Zhuravlyov Kunshang Ji +Kyle Bruene Kyle Liang Kyle Mistele Kylin <56434533+KyL0N@users.noreply.github.com> @@ -394,6 +418,7 @@ Liu Jia LoganDark Loïc Carrère LostRuins <39025047+LostRuins@users.noreply.github.com> +LostRuins Concedo <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian Lyle Dean @@ -423,6 +448,7 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. 
França Matheus Gabriel Alves Silva +Mathieu Baudier Mathieu Geli Mathieu Nayrolles Mathijs Henquet @@ -444,6 +470,7 @@ Meng, Hengyu Mengqing Cao Merrick Christensen Michael Coppola +Michael Engel Michael Francis Michael Hueschen Michael Kesper @@ -452,7 +479,9 @@ Michael Podvitskiy Michael Potter Michael de Gans Michaël de Vries +Michał Moskal Michał Tuszyński +Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com> Mihai Mike Mikko Juola @@ -477,6 +506,7 @@ Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang Neo Zhang Jianyu Neuman Vong +NeverLucky <92274250+nvrxq@users.noreply.github.com> Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> @@ -484,11 +514,15 @@ Nicholai Tukanov Nico Bosshard Nicolai Weitkemper Nicolás Pérez +Nicolò Scipione Nigel Bosch +Nikita Sarychev <42014488+sARY77@users.noreply.github.com> Niklas Korz NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> +Nikolaos Pothitos Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth +Nuno OSecret <135510162+OLSecret@users.noreply.github.com> Oleksandr Nikitin Oleksii Maryshchenko @@ -504,6 +538,7 @@ Pavel Zloi Pavol Rusnak Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com> Pedro Cuenca +Peter Peter Sugihara Phil H <5756783+phiharri@users.noreply.github.com> Philip Taron @@ -529,9 +564,12 @@ Rand Xie Randall Fitzgerald Random Fly Reinforce-II +Rémy Oudompheng Ren Xuancheng Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> +Reza Kakhki RhinoDevel +Riccardo Orlando Riceball LEE Rich Dougherty Richard Kiss @@ -544,6 +582,8 @@ Riley Stewart Rinne Rinne Robert Brisita <986796+rbrisita@users.noreply.github.com> +Robert Collins +Robert Ormandi <52251610+ormandi@users.noreply.github.com> Robert Sung-wook Shin Robey Holderith Robyn @@ -559,7 +599,9 @@ Roni Ronny Brendel Ronsor Rowan Hart +Ruan <47767371+ruanych@users.noreply.github.com> Ruchira Hasaranga +Rudi Servo Ruixin Huang <18860020911@163.com> Rune <43761327+Rune-AI@users.noreply.github.com> RunningLeon @@ -623,12 +665,14 @@ Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> +Sukriti Sharma SuperUserNameMan Sutou Kouhei Tai Duc Nguyen Taikono-Himazin Tameem <113388789+AhmadTameem@users.noreply.github.com> Tamotsu Takahashi +Tei Home Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com> Thatcher Chamberlin Theia Vogel @@ -640,6 +684,7 @@ Tim Miller Tim Wang Timmy Knight Timothy Cronin <40186632+4imothy@users.noreply.github.com> +Ting Lou Ting Lou Ting Sun Tobias Lütke @@ -661,6 +706,7 @@ Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov +Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com> Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> Vali Malinoiu <0x4139@gmail.com> Victor Nogueira @@ -673,13 +719,17 @@ Vladimir Malyutin Vladimir Zorin VoidIsVoid <343750470@qq.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> +Wang Qin <37098874+wangqin0@users.noreply.github.com> +Wang Ran (汪然) WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> Weird Constructor Welby Seely Wentai Zhang WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> William Tambellini +William Tambellini Willy Tarreau +Woof Dog <197125663+woof-dog@users.noreply.github.com> Wouter 
<9594229+DifferentialityDevelopment@users.noreply.github.com> Wu Jian Ping Wu Jian Ping @@ -692,6 +742,7 @@ Xie Yanbo Xingchen Song(宋星辰) Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Xuan Son Nguyen +Xuan-Son Nguyen Yaiko Yann Follet <131855179+YannFollet@users.noreply.github.com> Yaroslav @@ -702,7 +753,9 @@ Yoshi Suhara Yoshi Suhara Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> +Yüg Yui +Yun Dou Yuri Khrustalev Yusuf Kağan Hanoğlu Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> @@ -714,18 +767,23 @@ Zhang Peiyuan Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> Zhiyuan Li +Zhiyuan Li ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Zsapi a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> +a3sh <38979186+A3shTnT@users.noreply.github.com> adel boussaken afrideva <95653597+afrideva@users.noreply.github.com> +ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com> agray3 akawrykow <142945436+akawrykow@users.noreply.github.com> +alek3y <44779186+alek3y@users.noreply.github.com> alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj alwqx +amd-dwang amd-lalithnc amritahs-ibm andrijdavid @@ -737,6 +795,7 @@ arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank ardfork <134447697+ardfork@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> +aryantandon01 <80969509+aryantandon01@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> automaticcat awatuna <23447591+awatuna@users.noreply.github.com> @@ -751,12 +810,14 @@ bryanSwk <93190252+bryanSwk@users.noreply.github.com> bsilvereagle bssrdf byte-6174 <88070277+byte-6174@users.noreply.github.com> +cduk <19917266+cduk@users.noreply.github.com> cebtenzzre chaihahaha chiranko <96988916+chiranko@users.noreply.github.com> clibdev <52199778+clibdev@users.noreply.github.com> clyang cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> +codezjx coezbek comex compilade <113953597+compilade@users.noreply.github.com> @@ -780,14 +841,17 @@ drbh ds5t5 <145942675+ds5t5@users.noreply.github.com> dylan eastriver +ebraminio ebraminio eiery <19350831+eiery@users.noreply.github.com> eric8607242 fairydreaming <166155368+fairydreaming@users.noreply.github.com> fengerhu1 <2748250768@qq.com> +fj-y-saito <85871716+fj-y-saito@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic +gn64 goerch grahameth <96447521+grahameth@users.noreply.github.com> gtygo @@ -812,10 +876,12 @@ icppWorld <124377669+icppWorld@users.noreply.github.com> igarnier intelmatt <61025942+intelmatt@users.noreply.github.com> iohub +issixx <46835150+issixx@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> jdomke <28772296+jdomke@users.noreply.github.com> +jiahao su jiez <373447296@qq.com> jneem joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> @@ -828,6 +894,7 @@ junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com> k.h.lai kaizau +kallewoof kalomaze <66376113+kalomaze@users.noreply.github.com> kang katsu560 <118887472+katsu560@users.noreply.github.com> @@ -835,6 +902,7 @@ kchro3 <62481661+kchro3@users.noreply.github.com> khimaros kiltyj 
klosax <131523366+klosax@users.noreply.github.com> +krystiancha kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunnis kuronekosaiko @@ -847,6 +915,8 @@ ldwang le.chang leejet leo-pony +lexasub +lhez limitedAtonement liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> @@ -855,10 +925,13 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> +mahorozte <41834471+mahorozte@users.noreply.github.com> makomk manikbhandari maor-ps <154728172+maor-ps@users.noreply.github.com> +mashdragon <122402293+mashdragon@users.noreply.github.com> matiaslin <45382001+matiaslin@users.noreply.github.com> +matt23654 matteo mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> @@ -868,6 +941,7 @@ mmyjona momonga <115213907+mmnga@users.noreply.github.com> momonga <146910567+mmngays@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> +musoles <135031143+musoles@users.noreply.github.com> mzcu nanahi <130121847+na-na-hi@users.noreply.github.com> ngc92 <7938269+ngc92@users.noreply.github.com> @@ -885,6 +959,7 @@ oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> pculliton +peidaqi pengxin99 perserk piDack <104877312+piDack@users.noreply.github.com> @@ -892,10 +967,12 @@ pmysl postmasters pudepiedj qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> +qingy1337 qouoq qunash rabidcopy rankaiyx +redbeard rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> rhuddleston rimoliga <53384203+rimoliga@users.noreply.github.com> @@ -912,6 +989,7 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> +someone13574 <81528246+someone13574@users.noreply.github.com> standby24x7 staviq stduhpf @@ -931,6 +1009,7 @@ uint256_t uint256_t unbounded uvos +uvos valiray <133289098+valiray@users.noreply.github.com> vb vik @@ -951,6 +1030,7 @@ xaedes xctan xloem <0xloem@gmail.com> yangli2 +ymcki <84055651+ymcki@users.noreply.github.com> yuiseki yuri@FreeBSD zakkor @@ -963,4 +1043,5 @@ zrm 杨朱 · Kiki 源文雨 <41315874+fumiama@users.noreply.github.com> 蕭澧邦 <45505768+shou692199@users.noreply.github.com> +谢乃闻 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/README.md b/README.md index 7f306d199..d68330d2a 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) +- Rust (automated build from crates.io): [ShelbyJenkins/llm_client](https://github.com/ShelbyJenkins/llm_client) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) diff --git a/common/arg.cpp b/common/arg.cpp index dd0cc51d9..7f2b8e31c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1465,15 +1465,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--list-devices"}, "print list of 
available devices and exit", [](common_params &) { - printf("Available devices:\n"); + std::vector rpc_devices; + std::vector all_devices; for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { auto * dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { - size_t free, total; - ggml_backend_dev_memory(dev, &free, &total); - printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (ggml_backend_reg_name(reg) == std::string("RPC")) { + rpc_devices.push_back(dev); + } else { + all_devices.push_back(dev); + } } } + // insert RPC devices in front + all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end()); + printf("Available devices:\n"); + for (size_t i = 0; i < all_devices.size(); ++i) { + auto * dev = all_devices[i]; + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } exit(0); } )); diff --git a/common/chat.cpp b/common/chat.cpp index f87583d85..4a113c0ca 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -365,7 +365,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ return data; } static common_chat_msg common_chat_parse_command_r7b(const std::string & input) { - static std::regex response_regex("<\\|START_RESPONSE\\|>(.*?)<\\|END_RESPONSE\\|>"); + static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); std::smatch match; diff --git a/common/common.cpp b/common/common.cpp index 8d6e8b0cb..4f7d5a5b4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1932,11 +1932,19 @@ std::string common_chat_format_example(const common_chat_templates & tmpl, bool return common_chat_apply_template(tmpl, msgs, true, use_jinja); } +#define CHATML_TEMPLATE_SRC \ + "{%- for message in messages -%}\n" \ + " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ + "{%- endfor -%}\n" \ + "{%- if add_generation_prompt -%}\n" \ + " {{- '<|im_start|>assistant\n' -}}\n" \ + "{%- endif -%}" + common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) { - auto vocab = llama_model_get_vocab(model); - std::string default_template_src = chat_template_override; - std::string template_tool_use_src = chat_template_override; + std::string default_template_src; + std::string template_tool_use_src; + bool has_explicit_template = !chat_template_override.empty(); if (chat_template_override.empty()) { auto str = llama_model_chat_template(model, /* name */ nullptr); @@ -1949,21 +1957,21 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model template_tool_use_src = str; has_explicit_template = true; } + } else { + default_template_src = chat_template_override; } if (default_template_src.empty() || default_template_src == "chatml") { if (!template_tool_use_src.empty()) { default_template_src = template_tool_use_src; } else { - default_template_src = R"( - {%- for message in messages -%} - {{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}} - {%- 
endfor -%} - {%- if add_generation_prompt -%} - {{- "<|im_start|>assistant\n" -}} - {%- endif -%} - )"; + default_template_src = CHATML_TEMPLATE_SRC; } } + std::string token_bos; + std::string token_eos; + // TODO: update logic that adds BOS and EOS tokens to the tokenized prompt, in favour of the template. +#if 0 + auto vocab = llama_model_get_vocab(model); const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { if (token == LLAMA_TOKEN_NULL) { if (default_template_src.find(jinja_variable_name) != std::string::npos @@ -1975,15 +1983,25 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model return common_token_to_piece(vocab, token, true); } }; - auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); - auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); - return { - has_explicit_template, - std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos), - template_tool_use_src.empty() - ? nullptr - : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos) - }; + token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); + token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); +#endif + try { + return { + has_explicit_template, + std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos), + template_tool_use_src.empty() + ? nullptr + : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos), + }; + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what()); + return { + has_explicit_template, + std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos), + nullptr, + }; + } } // diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 371917b2e..55c31166c 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -31,6 +31,11 @@ defer { llama_model_free(model) } +guard let vocab = llama_model_get_vocab(model) else { + print("Failed to get vocab") + exit(1) +} + var tokens = tokenize(text: prompt, add_bos: true) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) @@ -41,7 +46,7 @@ context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 context_params.n_threads_batch = 8 -let context = llama_new_context_with_model(model, context_params) +let context = llama_init_from_model(model, context_params) guard context != nil else { print("Failed to initialize context") exit(1) } @@ -141,7 +146,7 @@ while n_cur <= n_len { let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) // is it an end of stream? -> mark the stream as finished - if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len { + if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -207,7 +212,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count let n_tokens = utf8Count + (add_bos ? 
1 : 0) let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) + let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) var swiftTokens: [llama_token] = [] for i in 0 ..< tokenCount { swiftTokens.append(tokens[Int(i)]) } @@ -218,12 +223,12 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false) + let nTokens = llama_token_to_piece(vocab, token, &result, Int32(result.count), 0, false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) let check = llama_token_to_piece( - model, + vocab, token, &result, Int32(result.count), diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 477c3e6f2..ee7141a66 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama actor LlamaContext { private var model: OpaquePointer private var context: OpaquePointer + private var vocab: OpaquePointer private var sampling: UnsafeMutablePointer<llama_sampler> private var batch: llama_batch private var tokens_list: [llama_token] @@ -47,6 +48,7 @@ actor LlamaContext { self.sampling = llama_sampler_chain_init(sparams) llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4)) llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234)) + vocab = llama_model_get_vocab(model) } deinit { @@ -79,7 +81,7 @@ actor LlamaContext { ctx_params.n_threads = Int32(n_threads) ctx_params.n_threads_batch = Int32(n_threads) - let context = llama_new_context_with_model(model, ctx_params) + let context = llama_init_from_model(model, ctx_params) guard let context else { print("Could not load context!") throw LlamaError.couldNotInitializeContext } @@ -151,7 +153,7 @@ actor LlamaContext { new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) - if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len { + if llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len { print("\n") is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) @@ -297,7 +299,7 @@ actor LlamaContext { let utf8Count = text.utf8.count let n_tokens = utf8Count + (add_bos ? 
1 : 0) + 1 let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) + let tokenCount = llama_tokenize(vocab, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) var swiftTokens: [llama_token] = [] for i in 0..<tokenCount { swiftTokens.append(tokens[Int(i)]) } private func token_to_piece(token: llama_token) -> [CChar] { var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, 8, 0, false) + let nTokens = llama_token_to_piece(vocab, token, &result, 8, 0, false) if nTokens < 0 { let newResult = UnsafeMutablePointer<CChar>.allocate(capacity: Int(-nTokens)) @@ -324,7 +326,7 @@ actor LlamaContext { defer { newResult.deallocate() } - let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false) + let nNewTokens = llama_token_to_piece(vocab, token, newResult, -nTokens, 0, false) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) return Array(bufferPointer) } else { diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index 3a2529aa2..e876776e8 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3ceba0558..be1a27c20 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3353,6 +3353,8 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } + // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch + LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); LOG_DBG("request: %s\n", req.body.c_str()); @@ -3439,9 +3441,13 @@ int main(int argc, char ** argv) { message = "Unknown Exception"; } - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); + try { + json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); + LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); + res_error(res, formatted_error); + } catch (const std::exception & e) { + LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str()); + } }); svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) { diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index f5d8b0572..f23d5cff4 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -13,9 +13,12 @@ def create_server(): @pytest.mark.parametrize( "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template", [ + (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None), + (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None), (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None), - (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), - (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), + (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), + (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'), + (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), ("codellama70b", "You are a coding assistant.", "Write the 
fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None), ] diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index e6ed9c9be..4a551404f 100644 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -67,8 +67,8 @@ WEATHER_TOOL = { def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): - n_predict = 512 global server + n_predict = 512 # server = ServerPreset.stories15m_moe() server.jinja = True server.n_predict = n_predict @@ -139,29 +139,49 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [ (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), - (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", 
"bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), # TODO: fix these # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): + global server n_predict = 512 server.n_slots = 1 server.jinja = True @@ -169,10 +189,12 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str server.n_predict = n_predict server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": n_predict, @@ -251,33 +273,55 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t @pytest.mark.slow @pytest.mark.parametrize("hf_repo,template_override", [ + ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")), ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + 
("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None): global server + n_predict = 512 server.n_slots = 1 server.jinja = True server.n_ctx = 8192 - server.n_predict = 512 + server.n_predict = n_predict server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": 256, + "max_tokens": n_predict, "messages": [ {"role": "user", "content": "What is the weather in Istanbul?"}, ], @@ -298,19 +342,39 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non @pytest.mark.slow @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + + (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + + ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + + (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + + (None, 
"bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. + (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server server.n_slots = 1 server.jinja = True @@ -318,10 +382,12 @@ def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: server.n_predict = 128 server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": 256, diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html index 882570c81..1ad85e739 100644 --- a/examples/server/webui/index.html +++ b/examples/server/webui/index.html @@ -154,7 +154,6 @@ placeholder="Type a message (Shift+Enter to add a new line)" v-model="inputMsg" @keydown.enter.exact.prevent="sendMessage" - :disabled="isGenerating" id="msg-input" dir="auto" > diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js index 90f4ca368..50197a355 100644 --- a/examples/server/webui/src/main.js +++ b/examples/server/webui/src/main.js @@ -468,7 +468,10 @@ const mainApp = createApp({ URL.revokeObjectURL(url); }, async sendMessage() { - if (!this.inputMsg) return; + // prevent sending empty message + // also allow typing the message while generating, but does not allow sending it (to match UX/UI behavior of other chat apps) + if (!this.inputMsg || this.isGenerating) return; + const currConvId = this.viewingConvId; StorageUtils.appendMsg(currConvId, { diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 7c069e420..75b5ea3b4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -274,22 +274,25 @@ endif() # Generate version info based on git commit. 
-find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH) -execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE GGML_BUILD_NUMBER - OUTPUT_STRIP_TRAILING_WHITESPACE -) +if(NOT DEFINED GGML_BUILD_NUMBER) + find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH) + execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GGML_BUILD_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) -if(GGML_BUILD_NUMBER EQUAL 1) - message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.") + if(GGML_BUILD_NUMBER EQUAL 1) + message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.") + endif() + + execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GGML_BUILD_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) endif() -execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE GGML_BUILD_COMMIT - OUTPUT_STRIP_TRAILING_WHITESPACE -) # Capture variables prefixed with GGML_. diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index eb03e10fa..f4a468363 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -46,6 +46,9 @@ endif() message(STATUS "HIP and hipBLAS found") +# Workaround old compilers +set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --gpu-max-threads-per-block=1024") + file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh") list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h") diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 76f8e4291..9605914ff 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -20,7 +20,10 @@ #define GGML_METAL_MAX_COMMAND_BUFFERS 8 // create residency sets only on macOS >= 15.0 -#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 +#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \ + TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \ + TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \ + TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000 #define GGML_METAL_HAS_RESIDENCY_SETS 1 #endif @@ -1071,7 +1074,7 @@ static bool ggml_backend_metal_buffer_rset_init( } #if defined(GGML_METAL_HAS_RESIDENCY_SETS) - if (@available(macOS 15.0, *)) { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) { MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init]; desc.label = @"ggml_backend_metal"; desc.initialCapacity = ctx->n_buffers; @@ -1106,7 +1109,7 @@ static bool ggml_backend_metal_buffer_rset_init( // rset free static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) { #if defined(GGML_METAL_HAS_RESIDENCY_SETS) - if (@available(macOS 15.0, *)) { + if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) { if (ctx->rset) { [ctx->rset endResidency]; [ctx->rset removeAllAllocations]; diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 34f1cbf69..26a105f64 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -498e0ecd2c4f9379439fd413805af10e8e9ff349 +694244a6e40dc255f6bb4376fb17431c06633e6c diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 9956c1f1f..50bd40738 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -289,7 +289,7 @@ static void 
test_template(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens, const common_chat_template tmpl(read_file("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"), "<s>", "</s>"); test_template(tmpl, end_tokens, text_message, tools, - "<|START_RESPONSE|>Hello, world!<|END_RESPONSE|>", + "<|START_RESPONSE|>Hello, world!\n" + "What's up?<|END_RESPONSE|>", /* expect_grammar_triggered= */ false); } { @@ -428,7 +429,7 @@ static void test_template_output_parsers() { assert_msg_equals(msg_from_json(text_message), common_chat_parse("{\n" - " \"response\": \"Hello, world!\"\n" + " \"response\": \"Hello, world!\\nWhat's up?\"\n" "}", common_chat_params_init(tmpl, inputs_tools).format)); test_template(tmpl, end_tokens, tool_call_message_with_id, tools, "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]"); @@ -451,7 +452,7 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_params_init(tmpl, inputs_tools).format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template( tmpl, end_tokens, tool_call_message_with_id, tools, "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]"); @@ -476,7 +477,7 @@ static void test_template_output_parsers() { inputs_tools) .format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, "<tool_call>\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "</tool_call>"); @@ -516,7 +517,7 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}"); } @@ -528,7 +529,7 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, common_chat_params_init(tmpl, inputs_tools).format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, "{\"arg1\": 1}"); } @@ -542,7 +543,8 @@ static void test_template_output_parsers() { test_template(tmpl, end_tokens, text_message, {}, "all\n" - "Hello, world!", + "Hello, world!\n" + "What's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, "special_function\n" "{\"arg1\": 1}"); @@ -555,7 +557,7 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_params_init(tmpl, inputs_tools).format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, " functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]"); } @@ -566,7 +568,7 @@ static 
void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); - test_template(tmpl, end_tokens, text_message, tools, "Hello, world!", /* expect_grammar_triggered= */ false); + test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, tool_call_message, tools, "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" "```json\n"
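
For quick reference, here is a minimal standalone C++ sketch (illustrative only, not part of the patch) of the regex behaviour that the common/chat.cpp change and the new multiline test strings above exercise: in ECMAScript regexes, '.' does not match newlines, so the old "(.*?)" group truncated Command R7B responses at the first line break, while "[\s\S\n\r]*?" spans them.

    // sketch: multiline capture with the widened Command R7B response regex
    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // old pattern: (.*?)          -- '.' excludes '\n', so "What's up?" was dropped
        // new pattern: ([\s\S\n\r]*?) -- matches any character, including line breaks
        static const std::regex response_regex(
            "<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");

        const std::string input = "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>";

        std::smatch match;
        if (std::regex_search(input, match, response_regex)) {
            std::cout << match[1] << std::endl;  // prints both lines of the response
        }
        return 0;
    }

Relatedly, the CHATML_TEMPLATE_SRC fallback added in common/common.cpp renders a single user message "Hey" with add_generation_prompt set as "<|im_start|>user\nHey<|im_end|>\n<|im_start|>assistant\n"; this is the template that the new --chat-template chatml test cases request explicitly, and it also serves as the fallback when a model's built-in template fails to parse.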