diff --git a/extensions/tokenizers/src/main/python/huggingface_converter.py b/extensions/tokenizers/src/main/python/huggingface_converter.py index f3b85c241ec..1efabb61c14 100644 --- a/extensions/tokenizers/src/main/python/huggingface_converter.py +++ b/extensions/tokenizers/src/main/python/huggingface_converter.py @@ -41,9 +41,20 @@ def save_model(self, model_info, args: Namespace, temp_dir: str): if not os.path.exists(temp_dir): os.makedirs(temp_dir) - hf_pipeline = self.load_model(model_id) - # Save tokenizer.json to temp dir - self.save_tokenizer(hf_pipeline, temp_dir) + try: + hf_pipeline = self.load_model(model_id) + except Exception as e: + logging.warning(f"Failed to load model: {model_id}.") + logging.warning(e, exc_info=True) + return False, "Failed to load model", -1 + + try: + # Save tokenizer.json to temp dir + self.save_tokenizer(hf_pipeline, temp_dir) + except Exception as e: + logging.warning(f"Failed to save tokenizer: {model_id}.") + logging.warning(e, exc_info=True) + return False, "Failed to save tokenizer", -1 # Save config.json just for reference config = hf_hub_download(repo_id=model_id, filename="config.json") @@ -112,7 +123,7 @@ def jit_trace_model(self, hf_pipeline, model_id: str, temp_dir: str, logging.info(f"Saving torchscript model: {model_name}.pt ...") model_file = os.path.join(temp_dir, f"{model_name}.pt") script_module.save(model_file) - except (RuntimeError, ValueError) as e: + except Exception as e: logging.warning(f"Failed to trace model: {model_id}.") logging.warning(e, exc_info=True) return None diff --git a/extensions/tokenizers/src/main/python/huggingface_models.py b/extensions/tokenizers/src/main/python/huggingface_models.py index 549db378813..12db9cf2c0c 100644 --- a/extensions/tokenizers/src/main/python/huggingface_models.py +++ b/extensions/tokenizers/src/main/python/huggingface_models.py @@ -56,6 +56,8 @@ def __init__(self, output_dir: str): self.temp_dir = f"{self.output_dir}/tmp" def list_models(self, args: Namespace) -> List[dict]: + import_all = os.environ.get("HF_IMPORT_ALL") + api = HfApi() if args.model_name: models = api.list_models(filter="pytorch", @@ -63,16 +65,20 @@ def list_models(self, args: Namespace) -> List[dict]: sort="downloads", direction=-1, limit=args.limit) - if not models: - logging.warning(f"no model found: {args.model_name}.") + import_all = True else: models = api.list_models(filter=f"{args.category},pytorch", sort="downloads", direction=-1, limit=args.limit) - if not models: + if not models: + if args.model_name: + logging.warning(f"no model found: {args.model_name}.") + else: logging.warning(f"no model matches category: {args.category}.") + return [] + ret = [] for model_info in models: model_id = model_info.modelId @@ -83,7 +89,7 @@ def list_models(self, args: Namespace) -> List[dict]: continue languages = get_lang_tags(model_info) - if "en" not in languages and not os.environ["HF_IMPORT_ALL"]: + if "en" not in languages and not import_all: logging.warning(f"Skip non-English model: {model_id}.") continue @@ -94,6 +100,12 @@ def list_models(self, args: Namespace) -> List[dict]: logging.info(f"Skip converted model: {model_id}.") continue + if model_info.downloads < 50 and not import_all: + logging.info( + f"Skip model {model_info.modelId}, downloads {model_info.downloads} < 50" + ) + continue + try: config = hf_hub_download(repo_id=model_id, filename="config.json") diff --git a/extensions/tokenizers/src/main/python/model_zoo_importer.py b/extensions/tokenizers/src/main/python/model_zoo_importer.py index 9ed32ec58ef..0ed67bd1018 100644 --- a/extensions/tokenizers/src/main/python/model_zoo_importer.py +++ b/extensions/tokenizers/src/main/python/model_zoo_importer.py @@ -49,9 +49,17 @@ def main(): model_info = model["model_info"] converter = SUPPORTED_TASK[task] - result, reason, size = converter.save_model(model_info, args, temp_dir) - if not result: - logging.error(f"{model_info.modelId}: {reason}") + try: + result, reason, size = converter.save_model( + model_info, args, temp_dir) + if not result: + logging.error(f"{model_info.modelId}: {reason}") + except Exception as e: + logging.warning(f"Failed to convert model: {model_info.modelId}.") + logging.warning(e, exc_info=True) + result = False + reason = "Failed to convert model" + size = -1 huggingface_models.update_progress(model_info, converter.application, result, reason, size, args.cpu_only)