Robert Collins 2024-11-17 00:34:52 +01:00 committed by GitHub
commit 6b27075768
2 changed files with 306 additions and 72 deletions

.gitignore (vendored): 190 additions

@@ -134,3 +134,193 @@ poetry.toml
# Test models for lora adapters
/lora-tests
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

convert_hf_to_gguf.py

@@ -744,8 +744,8 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self, add_to_gguf=True):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+    def _set_vocab_sentencepiece(self, add_to_gguf=True, use_tokenizer_json=False):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece(use_tokenizer_json)

         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
@@ -756,7 +756,7 @@ class Model:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _create_vocab_sentencepiece(self):
+    def _create_vocab_sentencepiece(self, use_tokenizer_json=False):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -764,77 +764,114 @@ class Model:
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

-        tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(str(tokenizer_path))
+        try:
+            tokenizer = SentencePieceProcessor()
+            tokenizer.LoadFromFile(str(tokenizer_path))

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

-        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
-        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+            tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+            scores: list[float] = [-10000.0] * vocab_size
+            toktypes: list[int] = [gguf.TokenType.UNUSED] * vocab_size

-        for token_id in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(token_id)
-            text = piece.encode("utf-8")
-            score = tokenizer.GetScore(token_id)
+            for token_id in range(tokenizer.vocab_size()):
+                piece = tokenizer.IdToPiece(token_id)
+                text = piece.encode("utf-8")
+                score = tokenizer.GetScore(token_id)

-            toktype = SentencePieceTokenTypes.NORMAL
-            if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
-            elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
-            elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
-            elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = gguf.TokenType.NORMAL
+                if tokenizer.IsUnknown(token_id):
+                    toktype = gguf.TokenType.UNKNOWN
+                elif tokenizer.IsControl(token_id):
+                    toktype = gguf.TokenType.CONTROL
+                elif tokenizer.IsUnused(token_id):
+                    toktype = gguf.TokenType.UNUSED
+                elif tokenizer.IsByte(token_id):
+                    toktype = gguf.TokenType.BYTE

-            tokens[token_id] = text
-            scores[token_id] = score
-            toktypes[token_id] = toktype
+                tokens[token_id] = text
+                scores[token_id] = score
+                toktypes[token_id] = toktype

-        added_tokens_file = self.dir_model / 'added_tokens.json'
-        if added_tokens_file.is_file():
-            with open(added_tokens_file, "r", encoding="utf-8") as f:
-                added_tokens_json = json.load(f)
-                for key in added_tokens_json:
-                    token_id = added_tokens_json[key]
-                    if token_id >= vocab_size:
-                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
-                        continue
+            # Handle added tokens from added_tokens.json
+            added_tokens_file = self.dir_model / 'added_tokens.json'
+            if added_tokens_file.is_file():
+                with open(added_tokens_file, "r", encoding="utf-8") as f:
+                    added_tokens_json = json.load(f)
+                    for key in added_tokens_json:
+                        token_id = added_tokens_json[key]
+                        if token_id >= vocab_size:
+                            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue

-                    tokens[token_id] = key.encode("utf-8")
-                    scores[token_id] = -1000.0
-                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                        tokens[token_id] = key.encode("utf-8")
+                        scores[token_id] = -1000.0
+                        toktypes[token_id] = gguf.TokenType.USER_DEFINED

-        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
-        if tokenizer_config_file.is_file():
-            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-                tokenizer_config_json = json.load(f)
-                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
-                for token_id, token_data in added_tokens_decoder.items():
-                    token_id = int(token_id)
-                    token: str = token_data["content"]
-                    if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        if tokens[token_id] != token.encode("utf-8"):
-                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
-                    if token_data.get("special") or self.does_token_look_special(token):
-                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
-                    else:
-                        token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
-                        toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+            # Handle added tokens from tokenizer.json (Salamandra models)
+            if use_tokenizer_json:
+                tokenizer_json_file = self.dir_model / 'tokenizer.json'
+                if tokenizer_json_file.is_file():
+                    with open(tokenizer_json_file, 'r', encoding='utf-8') as f:
+                        tokenizer_json = json.load(f)
+                        added_tokens = tokenizer_json.get('added_tokens', [])
+                        for token_data in added_tokens:
+                            token = token_data.get('content')
+                            token_id = token_data.get('id')
+                            if token is None or token_id is None:
+                                logger.warning(f'Missing token content or id in tokenizer.json: {token_data}')
+                                continue
+                            if token_id >= vocab_size:
+                                logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                                continue

-                    scores[token_id] = -1000.0
-                    tokens[token_id] = token.encode("utf-8")
+                            tokens[token_id] = token.encode("utf-8")
+                            scores[token_id] = -1000.0
+                            toktypes[token_id] = gguf.TokenType.USER_DEFINED
+                else:
+                    logger.warning(f"tokenizer.json file not found at {tokenizer_json_file}")

-        if vocab_size > len(tokens):
-            pad_count = vocab_size - len(tokens)
-            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-            for i in range(1, pad_count + 1):
-                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                scores.append(-1000.0)
-                toktypes.append(SentencePieceTokenTypes.UNUSED)
+            tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+            if tokenizer_config_file.is_file():
+                with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                    tokenizer_config_json = json.load(f)
+                    added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                    for token_id_str, token_data in added_tokens_decoder.items():
+                        token_id = int(token_id_str)
+                        token: str = token_data.get("content")
+                        if token is None:
+                            logger.warning(f'Missing token content in tokenizer_config.json for token_id {token_id}')
+                            continue
+                        if token_id >= vocab_size:
+                            logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+                        if toktypes[token_id] != gguf.TokenType.UNUSED:
+                            if tokens[token_id] != token.encode("utf-8"):
+                                logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
+                        if token_data.get("special") or self.does_token_look_special(token):
+                            toktypes[token_id] = gguf.TokenType.CONTROL
+                        else:
+                            token = token.replace("\u2581", " ")  # pre-normalize user-defined spaces
+                            toktypes[token_id] = gguf.TokenType.USER_DEFINED

-        return tokens, scores, toktypes
+                        scores[token_id] = -1000.0
+                        tokens[token_id] = token.encode("utf-8")
+            else:
+                logger.debug(f"tokenizer_config.json file not found at {tokenizer_config_file}")
+            if vocab_size > len(tokens):
+                pad_count = vocab_size - len(tokens)
+                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+                for i in range(1, pad_count + 1):
+                    tokens.append(f"[PAD{i}]".encode("utf-8"))
+                    scores.append(-1000.0)
+                    toktypes.append(gguf.TokenType.UNUSED)
+            return tokens, scores, toktypes
+        except Exception as e:
+            logger.error(f"Exception occurred in _create_vocab_sentencepiece: {e}")
+            raise  # Re-raise the exception to handle it appropriately

     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
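
Note on the new use_tokenizer_json path: the tokens it merges come from the standard Hugging Face tokenizer.json layout, where added_tokens is a list of entries carrying an id, a content string, and a special flag. A minimal standalone sketch of that lookup follows; the helper name read_added_tokens and the example checkpoint path are illustrative and not part of this commit.

    import json
    from pathlib import Path

    def read_added_tokens(model_dir: str) -> dict[int, str]:
        """Return {token_id: content} for every added_tokens entry in tokenizer.json."""
        tokenizer_json_file = Path(model_dir) / "tokenizer.json"
        with open(tokenizer_json_file, "r", encoding="utf-8") as f:
            tokenizer_json = json.load(f)
        added: dict[int, str] = {}
        for token_data in tokenizer_json.get("added_tokens", []):
            token_id = token_data.get("id")
            content = token_data.get("content")
            if token_id is None or content is None:
                continue  # the converter logs a warning and skips such entries
            added[token_id] = content
        return added

    # Example (hypothetical local checkpoint directory):
    # print(read_added_tokens("./salamandra-2b"))
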
@@ -1516,25 +1553,32 @@ class StableLMModel(Model):
             raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "SalamandraForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA

     def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
+        tokenizer_model_file = self.dir_model / 'tokenizer.model'
+        tokenizer_json_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_model_file.is_file() and tokenizer_json_file.is_file():
+            # Handle Salamandra models with both tokenizer.model and tokenizer.json
+            self._set_vocab_sentencepiece(use_tokenizer_json=True)
+        else:
             try:
-                self._set_vocab_llama_hf()
-            except (FileNotFoundError, TypeError):
-                # Llama 3
-                self._set_vocab_gpt2()
+                self._set_vocab_sentencepiece()
+            except FileNotFoundError:
+                try:
+                    self._set_vocab_llama_hf()
+                except (FileNotFoundError, TypeError):
+                    # Llama 3
+                    self._set_vocab_gpt2()

         # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
         if self.hparams.get("vocab_size", 32000) == 32016:
             special_vocab = gguf.SpecialVocab(
                 self.dir_model, load_merges=False,
-                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+                special_token_types=['prefix', 'suffix', 'middle', 'eot']
             )
             special_vocab._set_special_token("prefix", 32007)
             special_vocab._set_special_token("suffix", 32008)