commit 6b27075768
Author: Robert Collins
Date:   2024-11-17 00:34:52 +01:00 (committed by GitHub)

2 changed files with 306 additions and 72 deletions

.gitignore (vendored): 190 additions

@@ -134,3 +134,193 @@ poetry.toml
# Test models for lora adapters
/lora-tests
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
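
These additions track the stock Python and macOS ignore templates. As a quick sanity check that a path is now covered (the paths below are illustrative examples, not files from this repository), git's own matcher can report which rule applies; a minimal sketch:

import subprocess

# Ask git which .gitignore rule (if any) matches each candidate path.
# Run from the repository root; the paths are hypothetical examples.
for path in ["__pycache__/example.pyc", ".venv/bin/python", ".DS_Store"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True,
    )
    # check-ignore prints "source:line:pattern<TAB>path" for ignored paths, nothing otherwise
    print(path, "->", result.stdout.strip() or "not ignored")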


@@ -744,8 +744,8 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self, add_to_gguf=True):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+    def _set_vocab_sentencepiece(self, add_to_gguf=True, use_tokenizer_json=False):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece(use_tokenizer_json)

         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
@@ -756,7 +756,7 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _create_vocab_sentencepiece(self):
+    def _create_vocab_sentencepiece(self, use_tokenizer_json=False):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
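
The two hunks above thread a new use_tokenizer_json flag from the public vocab setter into the vocab builder. As a rough illustration (a hypothetical subclass, assuming the converter's existing Model base class), a model class that ships both tokenizer files could opt in like this:

class ExampleSentencePieceModel(Model):  # hypothetical subclass, for illustration only
    def set_vocab(self):
        # Build the vocab from tokenizer.model, then overlay the added tokens
        # declared in tokenizer.json (the behaviour gated by the new flag).
        self._set_vocab_sentencepiece(use_tokenizer_json=True)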
@@ -764,77 +764,114 @@ class Model:
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
-
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
-
-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
-
-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
-
-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
-
-        added_tokens_file = self.dir_model / 'added_tokens.json'
-        if added_tokens_file.is_file():
-            with open(added_tokens_file, "r", encoding="utf-8") as f:
-                added_tokens_json = json.load(f)
-                for key in added_tokens_json:
-                    token_id = added_tokens_json[key]
-                    if token_id >= vocab_size:
-                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
-                        continue
-
-                    tokens[token_id] = key.encode("utf-8")
-                    scores[token_id] = -1000.0
-                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-
-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
-                for token_id, token_data in added_tokens_decoder.items():
-                    token_id = int(token_id)
-                    token: str = token_data["content"]
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        if tokens[token_id] != token.encode("utf-8"):
-                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
-                    if token_data.get("special") or self.does_token_look_special(token):
-                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
-                    else:
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-                    scores[token_id] = -1000.0
-                    tokens[token_id] = token.encode("utf-8")
-
-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
-
-        return tokens, scores, toktypes
+        try:
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))
+
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+            tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+            scores: list[float] = [-10000.0] * vocab_size
+            toktypes: list[int] = [gguf.TokenType.UNUSED] * vocab_size
+
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)
+
+                toktype = gguf.TokenType.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = gguf.TokenType.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = gguf.TokenType.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = gguf.TokenType.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = gguf.TokenType.BYTE
+
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype
+
+            # Handle added tokens from added_tokens.json
+            added_tokens_file = self.dir_model / 'added_tokens.json'
+            if added_tokens_file.is_file():
+                with open(added_tokens_file, "r", encoding="utf-8") as f:
+                    added_tokens_json = json.load(f)
+                    for key in added_tokens_json:
+                        token_id = added_tokens_json[key]
+                        if token_id >= vocab_size:
+                            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        tokens[token_id] = key.encode("utf-8")
+                        scores[token_id] = -1000.0
+                        toktypes[token_id] = gguf.TokenType.USER_DEFINED
+
+            # Handle added tokens from tokenizer.json (Salamandra models)
+            if use_tokenizer_json:
+                tokenizer_json_file = self.dir_model / 'tokenizer.json'
+                if tokenizer_json_file.is_file():
+                    with open(tokenizer_json_file, 'r', encoding='utf-8') as f:
+                        tokenizer_json = json.load(f)
+                        added_tokens = tokenizer_json.get('added_tokens', [])
+                        for token_data in added_tokens:
+                            token = token_data.get('content')
+                            token_id = token_data.get('id')
+                            if token is None or token_id is None:
+                                logger.warning(f'Missing token content or id in tokenizer.json: {token_data}')
+                                continue
+                            if token_id >= vocab_size:
+                                logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                                continue
+                            tokens[token_id] = token.encode("utf-8")
+                            scores[token_id] = -1000.0
+                            toktypes[token_id] = gguf.TokenType.USER_DEFINED
+                else:
+                    logger.warning(f"tokenizer.json file not found at {tokenizer_json_file}")
+
+            tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+            if tokenizer_config_file.is_file():
+                with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                    tokenizer_config_json = json.load(f)
+                    added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                    for token_id_str, token_data in added_tokens_decoder.items():
+                        token_id = int(token_id_str)
+                        token: str = token_data.get("content")
+                        if token is None:
+                            logger.warning(f'Missing token content in tokenizer_config.json for token_id {token_id}')
+                            continue
+                        if token_id >= vocab_size:
+                            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+                        if toktypes[token_id] != gguf.TokenType.UNUSED:
+                            if tokens[token_id] != token.encode("utf-8"):
+                                logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                        if token_data.get("special") or self.does_token_look_special(token):
+                            toktypes[token_id] = gguf.TokenType.CONTROL
+                        else:
+                            token = token.replace("\u2581", " ")  # pre-normalize user-defined spaces
+                            toktypes[token_id] = gguf.TokenType.USER_DEFINED
+                        scores[token_id] = -1000.0
+                        tokens[token_id] = token.encode("utf-8")
+            else:
+                logger.debug(f"tokenizer_config.json file not found at {tokenizer_config_file}")
+
+            if vocab_size > len(tokens):
+                pad_count = vocab_size - len(tokens)
+                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+                for i in range(1, pad_count + 1):
+                    tokens.append(f"[PAD{i}]".encode("utf-8"))
+                    scores.append(-1000.0)
+                    toktypes.append(gguf.TokenType.UNUSED)
+
+            return tokens, scores, toktypes
+
+        except Exception as e:
+            logger.error(f"Exception occurred in _create_vocab_sentencepiece: {e}")
+            raise  # Re-raise the exception to handle it appropriately

     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
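
For context, the tokenizer.json branch added above expects the added_tokens list that Hugging Face tokenizers emit (objects with id, content and special fields). A small standalone sketch of the overlay it performs, with an invented JSON fragment and a toy vocab (nothing here comes from a real model):

import json

# Illustrative tokenizer.json fragment (invented ids/contents):
example_tokenizer_json = json.loads("""
{
  "added_tokens": [
    {"id": 3, "content": "<|im_start|>", "special": true},
    {"id": 4, "content": "<|im_end|>", "special": true}
  ]
}
""")

vocab_size = 8
tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores = [-10000.0] * vocab_size

for entry in example_tokenizer_json.get("added_tokens", []):
    token, token_id = entry.get("content"), entry.get("id")
    if token is None or token_id is None or token_id >= vocab_size:
        continue  # same guards the converter applies before overwriting a slot
    tokens[token_id] = token.encode("utf-8")
    scores[token_id] = -1000.0  # added tokens get a fixed low score

print(tokens[2:6])  # slots 3 and 4 now hold the added tokens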
@@ -1516,25 +1553,32 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "SalamandraForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA

     def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            try:
-                self._set_vocab_llama_hf()
-            except (FileNotFoundError, TypeError):
-                # Llama 3
-                self._set_vocab_gpt2()
+        tokenizer_model_file = self.dir_model / 'tokenizer.model'
+        tokenizer_json_file = self.dir_model / 'tokenizer.json'
+
+        if tokenizer_model_file.is_file() and tokenizer_json_file.is_file():
+            # Handle Salamandra models with both tokenizer.model and tokenizer.json
+            self._set_vocab_sentencepiece(use_tokenizer_json=True)
+        else:
+            try:
+                self._set_vocab_sentencepiece()
+            except FileNotFoundError:
+                try:
+                    self._set_vocab_llama_hf()
+                except (FileNotFoundError, TypeError):
+                    # Llama 3
+                    self._set_vocab_gpt2()

         # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
         if self.hparams.get("vocab_size", 32000) == 32016:
             special_vocab = gguf.SpecialVocab(
                 self.dir_model, load_merges=False,
-                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+                special_token_types=['prefix', 'suffix', 'middle', 'eot']
             )
             special_vocab._set_special_token("prefix", 32007)
             special_vocab._set_special_token("suffix", 32008)