chore: Fix model path references
This commit is contained in:
parent
b6f70b8a0e
commit
006bb60d27
1 changed files with 10 additions and 6 deletions
|
@ -174,12 +174,12 @@ class HFVocabRequest(HFHubBase):
|
||||||
except requests.exceptions.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
self.logger.error(f"Failed to download tokenizer {model['name']}: {e}")
|
self.logger.error(f"Failed to download tokenizer {model['name']}: {e}")
|
||||||
|
|
||||||
def download_model(self) -> None:
|
def download_models(self) -> None:
|
||||||
for model in self.models:
|
for model in self.models:
|
||||||
os.makedirs(f"{self.local_path}/{model['repo']}", exist_ok=True)
|
os.makedirs(f"{self.model_path}/{model['repo']}", exist_ok=True)
|
||||||
filenames = self.resolve_filenames(model['tokt'])
|
filenames = self.resolve_filenames(model['tokt'])
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
filepath = pathlib.Path(f"{self.local_path}/{model['repo']}/{filename}")
|
filepath = pathlib.Path(f"{self.model_path}/{model['repo']}/{filename}")
|
||||||
if filepath.is_file():
|
if filepath.is_file():
|
||||||
self.logger.info(f"skipped pre-existing tokenizer {model['repo']} in {filepath}")
|
self.logger.info(f"skipped pre-existing tokenizer {model['repo']} in {filepath}")
|
||||||
continue
|
continue
|
||||||
|
@ -189,19 +189,19 @@ class HFVocabRequest(HFHubBase):
|
||||||
checksums = []
|
checksums = []
|
||||||
for model in self.models:
|
for model in self.models:
|
||||||
mapping = {}
|
mapping = {}
|
||||||
filepath = f"{self.local_path}/{model['repo']}"
|
filepath = f"{self.model_path}/{model['repo']}"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(filepath, trust_remote=True)
|
tokenizer = AutoTokenizer.from_pretrained(filepath, trust_remote=True)
|
||||||
mapping.update(model)
|
mapping.update(model)
|
||||||
mapping['checksum'] = sha256(str(tokenizer.vocab).encode()).hexdigest()
|
mapping['checksum'] = sha256(str(tokenizer.vocab).encode()).hexdigest()
|
||||||
self.logger.info(f"Hashed {mapping['repo']} as {mapping['checksum']}")
|
self.logger.info(f"Hashed {mapping['repo']} as {mapping['checksum']}")
|
||||||
checksums.append(mapping)
|
checksums.append(mapping)
|
||||||
|
|
||||||
with open(f"{self.local_path.parent}/checksums.json") as file:
|
with open(f"{self.model_path.parent}/checksums.json") as file:
|
||||||
json.dump(checksums, file)
|
json.dump(checksums, file)
|
||||||
|
|
||||||
def log_pre_tokenizer_info(self) -> None:
|
def log_pre_tokenizer_info(self) -> None:
|
||||||
for model in self.models:
|
for model in self.models:
|
||||||
with open(f"{self.local_path}/{model['repo']}/tokenizer.json", "r", encoding="utf-8") as f:
|
with open(f"{self.model_path}/{model['repo']}/tokenizer.json", "r", encoding="utf-8") as f:
|
||||||
cfg = json.load(f)
|
cfg = json.load(f)
|
||||||
self.logger.info(f"normalizer: {json.dumps(cfg['normalizer'], indent=4)}")
|
self.logger.info(f"normalizer: {json.dumps(cfg['normalizer'], indent=4)}")
|
||||||
self.logger.info(f"pre_tokenizer: {json.dumps(cfg['pre_tokenizer'], indent=4)}")
|
self.logger.info(f"pre_tokenizer: {json.dumps(cfg['pre_tokenizer'], indent=4)}")
|
||||||
|
@ -221,3 +221,7 @@ class HFModelRequest(HFHubBase):
|
||||||
logger: None | logging.Logger
|
logger: None | logging.Logger
|
||||||
):
|
):
|
||||||
super().__init__(model_path, auth_token, logger)
|
super().__init__(model_path, auth_token, logger)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def model_type(self) -> ModelType:
|
||||||
|
return ModelType
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue