gguf-hash: add cpp and python implementation of layer + model wide hashing
This commit is contained in:
parent
a38b884c6c
commit
d8dd43f94f
12 changed files with 7981 additions and 0 deletions
6
Makefile
6
Makefile
|
@ -14,6 +14,7 @@ BUILD_TARGETS = \
|
|||
llama-finetune \
|
||||
llama-gbnf-validator \
|
||||
llama-gguf \
|
||||
llama-gguf-hash \
|
||||
llama-gguf-split \
|
||||
llama-gritlm \
|
||||
llama-imatrix \
|
||||
|
@ -1178,6 +1179,11 @@ llama-gguf: examples/gguf/gguf.cpp \
|
|||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.c examples/gguf-hash/deps/xxhash/xxhash.c\
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-gguf-split: examples/gguf-split/gguf-split.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
|
|
@ -23,6 +23,7 @@ else()
|
|||
add_subdirectory(export-lora)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(gguf)
|
||||
add_subdirectory(gritlm)
|
||||
|
|
13
examples/gguf-hash/CMakeLists.txt
Normal file
13
examples/gguf-hash/CMakeLists.txt
Normal file
|
@ -0,0 +1,13 @@
|
|||
set(TARGET llama-gguf-hash)
|
||||
add_executable(${TARGET} gguf-hash.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
# clibs dependencies
|
||||
include_directories(deps/)
|
||||
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
|
||||
target_link_libraries(${TARGET} PRIVATE xxhash)
|
||||
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
|
||||
target_link_libraries(${TARGET} PRIVATE sha1)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
28
examples/gguf-hash/README.md
Normal file
28
examples/gguf-hash/README.md
Normal file
|
@ -0,0 +1,28 @@
|
|||
## GGUF hash Example
|
||||
|
||||
CLI to hash GGUF files.
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--xxhash`: use xhash (default)
|
||||
- `--sha1`: use sha1
|
||||
- `--uuid`: use uuid
|
||||
|
||||
### Compile Example
|
||||
|
||||
```
|
||||
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
|
||||
make -C build clean
|
||||
make -C build llama-gguf-hash VERBOSE=1
|
||||
./build/bin/llama-gguf-hash test.gguf
|
||||
./build/bin/llama-gguf-hash --xxhash test.gguf
|
||||
./build/bin/llama-gguf-hash --sha1 test.gguf
|
||||
./build/bin/llama-gguf-hash --uuid test.gguf
|
||||
```
|
||||
|
||||
### Crypto/Hash Libraries Used
|
||||
|
||||
These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs)
|
||||
|
||||
- https://github.com/mofosyne/xxHash
|
||||
- https://github.com/clibs/sha1/
|
9
examples/gguf-hash/deps/sha1/package.json
Normal file
9
examples/gguf-hash/deps/sha1/package.json
Normal file
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"name": "sha1",
|
||||
"version": "0.0.1",
|
||||
"repo": "clibs/sha1",
|
||||
"description": "sha1 hash algorithm",
|
||||
"keywords": ["sha1", "hash"],
|
||||
"license": "public domain",
|
||||
"src": ["sha1.c", "sha1.h"]
|
||||
}
|
295
examples/gguf-hash/deps/sha1/sha1.c
Normal file
295
examples/gguf-hash/deps/sha1/sha1.c
Normal file
|
@ -0,0 +1,295 @@
|
|||
/*
|
||||
SHA-1 in C
|
||||
By Steve Reid <steve@edmweb.com>
|
||||
100% Public Domain
|
||||
|
||||
Test Vectors (from FIPS PUB 180-1)
|
||||
"abc"
|
||||
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
|
||||
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
|
||||
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
|
||||
A million repetitions of "a"
|
||||
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
|
||||
*/
|
||||
|
||||
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
|
||||
/* #define SHA1HANDSOFF * Copies data before messing with it. */
|
||||
|
||||
#define SHA1HANDSOFF
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/* for uint32_t */
|
||||
#include <stdint.h>
|
||||
|
||||
#include "sha1.h"
|
||||
|
||||
|
||||
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
|
||||
|
||||
/* blk0() and blk() perform the initial expand. */
|
||||
/* I got the idea of expanding during the round function from SSLeay */
|
||||
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|
||||
|(rol(block->l[i],8)&0x00FF00FF))
|
||||
#elif BYTE_ORDER == BIG_ENDIAN
|
||||
#define blk0(i) block->l[i]
|
||||
#else
|
||||
#error "Endianness not defined!"
|
||||
#endif
|
||||
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
|
||||
^block->l[(i+2)&15]^block->l[i&15],1))
|
||||
|
||||
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
|
||||
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
|
||||
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
|
||||
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
|
||||
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
|
||||
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
|
||||
|
||||
|
||||
/* Hash a single 512-bit block. This is the core of the algorithm. */
|
||||
|
||||
void SHA1Transform(
|
||||
uint32_t state[5],
|
||||
const unsigned char buffer[64]
|
||||
)
|
||||
{
|
||||
uint32_t a, b, c, d, e;
|
||||
|
||||
typedef union
|
||||
{
|
||||
unsigned char c[64];
|
||||
uint32_t l[16];
|
||||
} CHAR64LONG16;
|
||||
|
||||
#ifdef SHA1HANDSOFF
|
||||
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
|
||||
|
||||
memcpy(block, buffer, 64);
|
||||
#else
|
||||
/* The following had better never be used because it causes the
|
||||
* pointer-to-const buffer to be cast into a pointer to non-const.
|
||||
* And the result is written through. I threw a "const" in, hoping
|
||||
* this will cause a diagnostic.
|
||||
*/
|
||||
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
|
||||
#endif
|
||||
/* Copy context->state[] to working vars */
|
||||
a = state[0];
|
||||
b = state[1];
|
||||
c = state[2];
|
||||
d = state[3];
|
||||
e = state[4];
|
||||
/* 4 rounds of 20 operations each. Loop unrolled. */
|
||||
R0(a, b, c, d, e, 0);
|
||||
R0(e, a, b, c, d, 1);
|
||||
R0(d, e, a, b, c, 2);
|
||||
R0(c, d, e, a, b, 3);
|
||||
R0(b, c, d, e, a, 4);
|
||||
R0(a, b, c, d, e, 5);
|
||||
R0(e, a, b, c, d, 6);
|
||||
R0(d, e, a, b, c, 7);
|
||||
R0(c, d, e, a, b, 8);
|
||||
R0(b, c, d, e, a, 9);
|
||||
R0(a, b, c, d, e, 10);
|
||||
R0(e, a, b, c, d, 11);
|
||||
R0(d, e, a, b, c, 12);
|
||||
R0(c, d, e, a, b, 13);
|
||||
R0(b, c, d, e, a, 14);
|
||||
R0(a, b, c, d, e, 15);
|
||||
R1(e, a, b, c, d, 16);
|
||||
R1(d, e, a, b, c, 17);
|
||||
R1(c, d, e, a, b, 18);
|
||||
R1(b, c, d, e, a, 19);
|
||||
R2(a, b, c, d, e, 20);
|
||||
R2(e, a, b, c, d, 21);
|
||||
R2(d, e, a, b, c, 22);
|
||||
R2(c, d, e, a, b, 23);
|
||||
R2(b, c, d, e, a, 24);
|
||||
R2(a, b, c, d, e, 25);
|
||||
R2(e, a, b, c, d, 26);
|
||||
R2(d, e, a, b, c, 27);
|
||||
R2(c, d, e, a, b, 28);
|
||||
R2(b, c, d, e, a, 29);
|
||||
R2(a, b, c, d, e, 30);
|
||||
R2(e, a, b, c, d, 31);
|
||||
R2(d, e, a, b, c, 32);
|
||||
R2(c, d, e, a, b, 33);
|
||||
R2(b, c, d, e, a, 34);
|
||||
R2(a, b, c, d, e, 35);
|
||||
R2(e, a, b, c, d, 36);
|
||||
R2(d, e, a, b, c, 37);
|
||||
R2(c, d, e, a, b, 38);
|
||||
R2(b, c, d, e, a, 39);
|
||||
R3(a, b, c, d, e, 40);
|
||||
R3(e, a, b, c, d, 41);
|
||||
R3(d, e, a, b, c, 42);
|
||||
R3(c, d, e, a, b, 43);
|
||||
R3(b, c, d, e, a, 44);
|
||||
R3(a, b, c, d, e, 45);
|
||||
R3(e, a, b, c, d, 46);
|
||||
R3(d, e, a, b, c, 47);
|
||||
R3(c, d, e, a, b, 48);
|
||||
R3(b, c, d, e, a, 49);
|
||||
R3(a, b, c, d, e, 50);
|
||||
R3(e, a, b, c, d, 51);
|
||||
R3(d, e, a, b, c, 52);
|
||||
R3(c, d, e, a, b, 53);
|
||||
R3(b, c, d, e, a, 54);
|
||||
R3(a, b, c, d, e, 55);
|
||||
R3(e, a, b, c, d, 56);
|
||||
R3(d, e, a, b, c, 57);
|
||||
R3(c, d, e, a, b, 58);
|
||||
R3(b, c, d, e, a, 59);
|
||||
R4(a, b, c, d, e, 60);
|
||||
R4(e, a, b, c, d, 61);
|
||||
R4(d, e, a, b, c, 62);
|
||||
R4(c, d, e, a, b, 63);
|
||||
R4(b, c, d, e, a, 64);
|
||||
R4(a, b, c, d, e, 65);
|
||||
R4(e, a, b, c, d, 66);
|
||||
R4(d, e, a, b, c, 67);
|
||||
R4(c, d, e, a, b, 68);
|
||||
R4(b, c, d, e, a, 69);
|
||||
R4(a, b, c, d, e, 70);
|
||||
R4(e, a, b, c, d, 71);
|
||||
R4(d, e, a, b, c, 72);
|
||||
R4(c, d, e, a, b, 73);
|
||||
R4(b, c, d, e, a, 74);
|
||||
R4(a, b, c, d, e, 75);
|
||||
R4(e, a, b, c, d, 76);
|
||||
R4(d, e, a, b, c, 77);
|
||||
R4(c, d, e, a, b, 78);
|
||||
R4(b, c, d, e, a, 79);
|
||||
/* Add the working vars back into context.state[] */
|
||||
state[0] += a;
|
||||
state[1] += b;
|
||||
state[2] += c;
|
||||
state[3] += d;
|
||||
state[4] += e;
|
||||
/* Wipe variables */
|
||||
a = b = c = d = e = 0;
|
||||
#ifdef SHA1HANDSOFF
|
||||
memset(block, '\0', sizeof(block));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* SHA1Init - Initialize new context */
|
||||
|
||||
void SHA1Init(
|
||||
SHA1_CTX * context
|
||||
)
|
||||
{
|
||||
/* SHA1 initialization constants */
|
||||
context->state[0] = 0x67452301;
|
||||
context->state[1] = 0xEFCDAB89;
|
||||
context->state[2] = 0x98BADCFE;
|
||||
context->state[3] = 0x10325476;
|
||||
context->state[4] = 0xC3D2E1F0;
|
||||
context->count[0] = context->count[1] = 0;
|
||||
}
|
||||
|
||||
|
||||
/* Run your data through this. */
|
||||
|
||||
void SHA1Update(
|
||||
SHA1_CTX * context,
|
||||
const unsigned char *data,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
uint32_t j;
|
||||
|
||||
j = context->count[0];
|
||||
if ((context->count[0] += len << 3) < j)
|
||||
context->count[1]++;
|
||||
context->count[1] += (len >> 29);
|
||||
j = (j >> 3) & 63;
|
||||
if ((j + len) > 63)
|
||||
{
|
||||
memcpy(&context->buffer[j], data, (i = 64 - j));
|
||||
SHA1Transform(context->state, context->buffer);
|
||||
for (; i + 63 < len; i += 64)
|
||||
{
|
||||
SHA1Transform(context->state, &data[i]);
|
||||
}
|
||||
j = 0;
|
||||
}
|
||||
else
|
||||
i = 0;
|
||||
memcpy(&context->buffer[j], &data[i], len - i);
|
||||
}
|
||||
|
||||
|
||||
/* Add padding and return the message digest. */
|
||||
|
||||
void SHA1Final(
|
||||
unsigned char digest[20],
|
||||
SHA1_CTX * context
|
||||
)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
unsigned char finalcount[8];
|
||||
|
||||
unsigned char c;
|
||||
|
||||
#if 0 /* untested "improvement" by DHR */
|
||||
/* Convert context->count to a sequence of bytes
|
||||
* in finalcount. Second element first, but
|
||||
* big-endian order within element.
|
||||
* But we do it all backwards.
|
||||
*/
|
||||
unsigned char *fcp = &finalcount[8];
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
uint32_t t = context->count[i];
|
||||
|
||||
int j;
|
||||
|
||||
for (j = 0; j < 4; t >>= 8, j++)
|
||||
*--fcp = (unsigned char) t}
|
||||
#else
|
||||
for (i = 0; i < 8; i++)
|
||||
{
|
||||
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
|
||||
}
|
||||
#endif
|
||||
c = 0200;
|
||||
SHA1Update(context, &c, 1);
|
||||
while ((context->count[0] & 504) != 448)
|
||||
{
|
||||
c = 0000;
|
||||
SHA1Update(context, &c, 1);
|
||||
}
|
||||
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
|
||||
for (i = 0; i < 20; i++)
|
||||
{
|
||||
digest[i] = (unsigned char)
|
||||
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
|
||||
}
|
||||
/* Wipe variables */
|
||||
memset(context, '\0', sizeof(*context));
|
||||
memset(&finalcount, '\0', sizeof(finalcount));
|
||||
}
|
||||
|
||||
void SHA1(
|
||||
char *hash_out,
|
||||
const char *str,
|
||||
uint32_t len)
|
||||
{
|
||||
SHA1_CTX ctx;
|
||||
unsigned int ii;
|
||||
|
||||
SHA1Init(&ctx);
|
||||
for (ii=0; ii<len; ii+=1)
|
||||
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
|
||||
SHA1Final((unsigned char *)hash_out, &ctx);
|
||||
}
|
||||
|
52
examples/gguf-hash/deps/sha1/sha1.h
Normal file
52
examples/gguf-hash/deps/sha1/sha1.h
Normal file
|
@ -0,0 +1,52 @@
|
|||
#ifndef SHA1_H
|
||||
#define SHA1_H
|
||||
|
||||
/*
|
||||
SHA-1 in C
|
||||
By Steve Reid <steve@edmweb.com>
|
||||
100% Public Domain
|
||||
*/
|
||||
|
||||
#include "stdint.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t state[5];
|
||||
uint32_t count[2];
|
||||
unsigned char buffer[64];
|
||||
} SHA1_CTX;
|
||||
|
||||
void SHA1Transform(
|
||||
uint32_t state[5],
|
||||
const unsigned char buffer[64]
|
||||
);
|
||||
|
||||
void SHA1Init(
|
||||
SHA1_CTX * context
|
||||
);
|
||||
|
||||
void SHA1Update(
|
||||
SHA1_CTX * context,
|
||||
const unsigned char *data,
|
||||
uint32_t len
|
||||
);
|
||||
|
||||
void SHA1Final(
|
||||
unsigned char digest[20],
|
||||
SHA1_CTX * context
|
||||
);
|
||||
|
||||
void SHA1(
|
||||
char *hash_out,
|
||||
const char *str,
|
||||
uint32_t len);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* SHA1_H */
|
12
examples/gguf-hash/deps/xxhash/clib.json
Normal file
12
examples/gguf-hash/deps/xxhash/clib.json
Normal file
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"name": "xxhash",
|
||||
"version": "0.8.2",
|
||||
"repo": "mofosyne/xxhash",
|
||||
"description": "Extremely fast non-cryptographic hash algorithm",
|
||||
"keywords": ["xxhash", "hashing"],
|
||||
"license": "BSD-2-Clause",
|
||||
"src": [
|
||||
"xxhash.c",
|
||||
"xxhash.h"
|
||||
]
|
||||
}
|
42
examples/gguf-hash/deps/xxhash/xxhash.c
Normal file
42
examples/gguf-hash/deps/xxhash/xxhash.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
* xxHash - Extremely Fast Hash algorithm
|
||||
* Copyright (C) 2012-2023 Yann Collet
|
||||
*
|
||||
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* You can contact the author at:
|
||||
* - xxHash homepage: https://www.xxhash.com
|
||||
* - xxHash source repository: https://github.com/Cyan4973/xxHash
|
||||
*/
|
||||
|
||||
/*
|
||||
* xxhash.c instantiates functions defined in xxhash.h
|
||||
*/
|
||||
|
||||
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
|
||||
#define XXH_IMPLEMENTATION /* access definitions */
|
||||
|
||||
#include "xxhash.h"
|
7089
examples/gguf-hash/deps/xxhash/xxhash.h
Normal file
7089
examples/gguf-hash/deps/xxhash/xxhash.h
Normal file
File diff suppressed because it is too large
Load diff
343
examples/gguf-hash/gguf-hash.cpp
Normal file
343
examples/gguf-hash/gguf-hash.cpp
Normal file
|
@ -0,0 +1,343 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#include "stdlib.h" /* abort() */
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "xxhash/xxhash.h"
|
||||
#include "sha1/sha1.h"
|
||||
|
||||
#ifdef SHA256 // TODO: https://github.com/jb55/sha256.c
|
||||
#include "sha256/sha256.h"
|
||||
#endif
|
||||
|
||||
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
|
||||
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
|
||||
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
|
||||
|
||||
struct hash_params {
|
||||
std::string input;
|
||||
bool xxhash = false;
|
||||
bool sha1 = false;
|
||||
bool uuid = false;
|
||||
#ifdef SHA256
|
||||
bool sha256 = false;
|
||||
#endif
|
||||
};
|
||||
|
||||
static void hash_print_usage(const char * executable) {
|
||||
const hash_params default_params;
|
||||
printf("\n");
|
||||
printf("usage: %s [options] GGUF_IN\n", executable);
|
||||
printf("\n");
|
||||
printf("Hash a GGUF file");
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --xxhash use xxhash\n");
|
||||
printf(" --sha1 use sha1\n");
|
||||
printf(" --uuid use uuid\n");
|
||||
#ifdef SHA256
|
||||
printf(" --sha256 use sha256\n");
|
||||
#endif
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
|
||||
std::string arg;
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
int arg_idx = 1;
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
arg = argv[arg_idx];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
bool arg_found = false;
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
hash_print_usage(argv[0]);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (arg == "--xxhash") {
|
||||
arg_found = true;
|
||||
params.xxhash = true;
|
||||
}
|
||||
|
||||
if (arg == "--sha1") {
|
||||
arg_found = true;
|
||||
params.sha1 = true;
|
||||
}
|
||||
|
||||
if (arg == "--uuid") {
|
||||
arg_found = true;
|
||||
params.uuid = true;
|
||||
}
|
||||
|
||||
#ifdef SHA256
|
||||
if (arg == "--sha256") {
|
||||
arg_found = true;
|
||||
params.sha256 = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!arg_found) {
|
||||
throw std::invalid_argument("error: unknown argument: " + arg);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.xxhash
|
||||
&& !params.sha1
|
||||
&& !params.uuid
|
||||
#ifdef SHA256
|
||||
&& !params.sha256
|
||||
#endif
|
||||
) {
|
||||
// By default if no swich argument provided, assume xxhash
|
||||
params.xxhash = true;
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 1) {
|
||||
throw std::invalid_argument("error: bad arguments");
|
||||
}
|
||||
|
||||
params.input = argv[arg_idx++];
|
||||
}
|
||||
|
||||
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
|
||||
bool result = true;
|
||||
try {
|
||||
hash_params_parse_ex(argc, argv, params);
|
||||
}
|
||||
catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
hash_print_usage(argv[0]);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static bool gguf_hash(const hash_params & hash_params) {
|
||||
const std::string & fname = hash_params.input;
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
|
||||
// xxhash init
|
||||
XXH64_state_t* xxhash_model_hash_state = NULL;
|
||||
if (hash_params.xxhash) {
|
||||
xxhash_model_hash_state = XXH64_createState();
|
||||
if (xxhash_model_hash_state==NULL) {
|
||||
abort();
|
||||
}
|
||||
|
||||
XXH64_hash_t const seed = 0;
|
||||
if (XXH64_reset(xxhash_model_hash_state, seed) == XXH_ERROR) {
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
// sha1 init
|
||||
SHA1_CTX sha1_model_hash_ctx;
|
||||
if (hash_params.sha1) {
|
||||
SHA1Init(&sha1_model_hash_ctx);
|
||||
}
|
||||
|
||||
#ifdef SHA256
|
||||
// sha256 init
|
||||
sha256_t sha256_model_hash_ctx;
|
||||
if (hash_params.sha256) {
|
||||
sha256_init(&sha256_model_hash_ctx);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
auto n_bytes = ggml_nbytes(cur);
|
||||
auto *raw_data = cur->data;
|
||||
|
||||
if (hash_params.xxhash) {
|
||||
|
||||
// Per Layer Hash
|
||||
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
|
||||
|
||||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
printf("xxhash %s %s:%s\n", hex_result, fname.c_str(), name);
|
||||
|
||||
// Overall Model Hash
|
||||
if (XXH64_update(xxhash_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
|
||||
}
|
||||
|
||||
if (hash_params.sha1) {
|
||||
|
||||
// Per Layer Hash
|
||||
char result[21]; // sha1 outputs 20 bytes
|
||||
SHA1( result, (const char *)raw_data, n_bytes);
|
||||
|
||||
char hex_result[41] = {0};
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
printf("sha1 %s %s:%s\n", hex_result, fname.c_str(), name);
|
||||
|
||||
// Overall Model Hash
|
||||
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
|
||||
}
|
||||
|
||||
#ifdef SHA256
|
||||
if (hash_params.sha256) {
|
||||
|
||||
// Per Layer Hash
|
||||
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
|
||||
sha256_hash( result, (const unsigned char *)raw_data, n_bytes);
|
||||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
printf("sha256 %s %s:%s\n", hex_result, fname.c_str(), name);
|
||||
|
||||
// Overall Model Hash
|
||||
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (hash_params.xxhash) {
|
||||
XXH64_hash_t const hash = XXH64_digest(xxhash_model_hash_state);
|
||||
|
||||
char hex_result[17];
|
||||
for (int offset = 0; offset < 8; offset++) {
|
||||
unsigned int shift_bits_by = (8 * (8 - offset - 1));
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
|
||||
}
|
||||
|
||||
printf("xxhash %s %s\n", hex_result, fname.c_str());
|
||||
}
|
||||
|
||||
if (hash_params.sha1) {
|
||||
unsigned char result[21];
|
||||
SHA1Final(result, &sha1_model_hash_ctx);
|
||||
|
||||
char hex_result[41];
|
||||
for (int offset = 0; offset < 20; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
printf("sha1 %s %s\n", hex_result, fname.c_str());
|
||||
}
|
||||
|
||||
#ifdef SHA256
|
||||
if (hash_params.sha256) {
|
||||
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
|
||||
sha256_final( &sha256_model_hash_ctx, result);
|
||||
|
||||
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
|
||||
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
|
||||
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
|
||||
}
|
||||
|
||||
printf("sha256 %s %s\n", hex_result, fname.c_str());
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
|
||||
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
|
||||
// Assumes that digest was processed correctly with the expected namespace
|
||||
for (int i = 0; i < 16; i++) {
|
||||
uuid[i] = sha1_digest[i];
|
||||
}
|
||||
|
||||
// Set bits corresponding to UUID ver 5
|
||||
uuid[ 6] &= ~(0xF << 4);
|
||||
uuid[ 6] |= (5 << 4);
|
||||
|
||||
// Set bits corresponding to UUID variant 0b10XX
|
||||
uuid[ 8] &= ~(0xc << 4);
|
||||
uuid[ 8] |= (0x8 << 4);
|
||||
}
|
||||
|
||||
static bool gguf_uuid(const hash_params & hash_params) {
|
||||
if (!hash_params.uuid) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::string & fname = hash_params.input;
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ &ctx_data,
|
||||
};
|
||||
|
||||
// sha1 init
|
||||
SHA1_CTX sha1_model_hash_ctx;
|
||||
SHA1Init(&sha1_model_hash_ctx);
|
||||
|
||||
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
|
||||
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
auto n_bytes = ggml_nbytes(cur);
|
||||
auto *raw_data = cur->data;
|
||||
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
|
||||
}
|
||||
|
||||
unsigned char result[21];
|
||||
SHA1Final(result, &sha1_model_hash_ctx);
|
||||
|
||||
unsigned char uuid[16];
|
||||
generate_uuidv5(result, uuid);
|
||||
|
||||
char string_buffer[37] = {0};
|
||||
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
uuid[0], uuid[1], uuid[2], uuid[3],
|
||||
uuid[4], uuid[5], uuid[6], uuid[7],
|
||||
uuid[8], uuid[9], uuid[10], uuid[11],
|
||||
uuid[12], uuid[13], uuid[14], uuid[15]);
|
||||
printf("UUIDv5 %s %s\n", string_buffer, fname.c_str());
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
hash_params params;
|
||||
hash_params_parse(argc, argv, params);
|
||||
|
||||
gguf_hash(params);
|
||||
gguf_uuid(params);
|
||||
|
||||
return 0;
|
||||
}
|
91
gguf-py/scripts/gguf-hash.py
Executable file
91
gguf-py/scripts/gguf-hash.py
Executable file
|
@ -0,0 +1,91 @@
|
|||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
import hashlib
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
# Necessary to load the local gguf package
|
||||
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from gguf import GGUFReader # noqa: E402
|
||||
|
||||
|
||||
logger = logging.getLogger("gguf-hash")
|
||||
|
||||
# UUID_NAMESPACE_LLAMA_CPP = uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
|
||||
UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
|
||||
|
||||
|
||||
# For more information about what field.parts and field.data represent,
|
||||
# please see the comments in the modify_gguf.py example.
|
||||
def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
|
||||
sha1 = hashlib.sha1()
|
||||
uuidv5_sha1 = hashlib.sha1()
|
||||
uuidv5_sha1.update(UUID_NAMESPACE_LLAMA_CPP.bytes)
|
||||
|
||||
# Total Weight Calculation For Progress Bar
|
||||
total_weights = 0
|
||||
for n, tensor in enumerate(reader.tensors, 1):
|
||||
|
||||
# We don't need these
|
||||
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
# Calculate Tensor Volume
|
||||
sum_weights_in_tensor = 1
|
||||
for dim in tensor.shape:
|
||||
sum_weights_in_tensor *= dim
|
||||
total_weights += sum_weights_in_tensor
|
||||
|
||||
# Hash Progress Bar
|
||||
bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar)
|
||||
|
||||
# Hashing Process
|
||||
for n, tensor in enumerate(reader.tensors, 1):
|
||||
|
||||
# We don't need these
|
||||
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
# Progressbar
|
||||
sum_weights_in_tensor = 1
|
||||
for dim in tensor.shape:
|
||||
sum_weights_in_tensor *= dim
|
||||
bar.update(sum_weights_in_tensor)
|
||||
|
||||
sha1_layer = hashlib.sha1()
|
||||
sha1_layer.update(tensor.data)
|
||||
sha1.update(tensor.data)
|
||||
uuidv5_sha1.update(tensor.data)
|
||||
print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
|
||||
|
||||
# Flush Hash Progress Bar
|
||||
bar.close()
|
||||
|
||||
# Display Hash Output
|
||||
print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
|
||||
print("UUIDv5 {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
|
||||
parser.add_argument("model", type=str, help="GGUF format model filename")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--progressbar", action="store_true", help="enable progressbar")
|
||||
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
reader = GGUFReader(args.model, 'r')
|
||||
gguf_hash(reader, args.model, not args.progressbar)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
Loading…
Add table
Add a link
Reference in a new issue