From 50ea3f212894f4b22507b9de05099d55118cb9d1 Mon Sep 17 00:00:00 2001 From: Frank Liu Date: Sat, 11 May 2024 18:49:32 -0700 Subject: [PATCH] [tokenizer] Refactor djl_convert python code --- extensions/tokenizers/.gitignore | 3 +++ .../tokenizers/src/main/python/.gitignore | 4 +-- .../djl_converter/fill_mask_converter.py | 2 +- .../djl_converter/huggingface_converter.py | 10 +++---- .../djl_converter/huggingface_models.py | 26 +++++++++---------- .../src/main/python/djl_converter/metadata.py | 17 +++++++++++- .../djl_converter/model_zoo_importer.py | 22 +++++----------- .../question_answering_converter.py | 2 +- .../sentence_similarity_converter.py | 2 +- .../text_classification_converter.py | 2 +- .../token_classification_converter.py | 2 +- 11 files changed, 49 insertions(+), 43 deletions(-) diff --git a/extensions/tokenizers/.gitignore b/extensions/tokenizers/.gitignore index 41e087b1789..171bfe581df 100644 --- a/extensions/tokenizers/.gitignore +++ b/extensions/tokenizers/.gitignore @@ -1,3 +1,6 @@ Cargo.lock /tokenizers /jnilib +model/ +tmp/ +models.json diff --git a/extensions/tokenizers/src/main/python/.gitignore b/extensions/tokenizers/src/main/python/.gitignore index a96c5d3f5d3..567994ceb19 100644 --- a/extensions/tokenizers/src/main/python/.gitignore +++ b/extensions/tokenizers/src/main/python/.gitignore @@ -1,4 +1,2 @@ __pycache__ -model/ -tmp/ -models.json +*.egg-info/ diff --git a/extensions/tokenizers/src/main/python/djl_converter/fill_mask_converter.py b/extensions/tokenizers/src/main/python/djl_converter/fill_mask_converter.py index ff9de4bea2e..5c225293f41 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/fill_mask_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/fill_mask_converter.py @@ -14,7 +14,7 @@ import torch -from huggingface_converter import HuggingfaceConverter +from djl_converter.huggingface_converter import HuggingfaceConverter class FillMaskConverter(HuggingfaceConverter): diff --git a/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py b/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py index 167b4ad8104..70df294a2e2 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py @@ -17,14 +17,14 @@ from argparse import Namespace import onnx -import safetensors_convert +from djl_converter.safetensors_convert import convert_file import torch from huggingface_hub import hf_hub_download, HfApi from transformers import pipeline, AutoTokenizer, AutoConfig -from metadata import HuggingfaceMetadata -from shasum import sha1_sum -from zip_utils import zip_dir +from djl_converter.metadata import HuggingfaceMetadata +from djl_converter.shasum import sha1_sum +from djl_converter.zip_utils import zip_dir class PipelineHolder(object): @@ -139,7 +139,7 @@ def save_rust_model(self, model_info, args: Namespace, temp_dir: str): elif has_pt_file: file = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin") - safetensors_convert.convert_file(file, target) + convert_file(file, target) else: return False, f"No model file found for: {model_id}", -1 diff --git a/extensions/tokenizers/src/main/python/djl_converter/huggingface_models.py b/extensions/tokenizers/src/main/python/djl_converter/huggingface_models.py index cba19d477e1..e856871d133 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/huggingface_models.py +++ b/extensions/tokenizers/src/main/python/djl_converter/huggingface_models.py @@ -19,6 +19,12 @@ from huggingface_hub import HfApi from huggingface_hub import hf_hub_download from huggingface_hub.hf_api import ModelInfo +from djl_converter.fill_mask_converter import FillMaskConverter +from djl_converter.metadata import get_lang_tags +from djl_converter.question_answering_converter import QuestionAnsweringConverter +from djl_converter.sentence_similarity_converter import SentenceSimilarityConverter +from djl_converter.text_classification_converter import TextClassificationConverter +from djl_converter.token_classification_converter import TokenClassificationConverter ARCHITECTURES_2_TASK = { "ForQuestionAnswering": "question-answering", @@ -27,19 +33,13 @@ "ForMultipleChoice": "text-classification", "ForMaskedLM": "fill-mask", } -LANGUAGES = HfApi().get_model_tags()["language"] - - -def get_lang_tags(model_info): - tags = {} - for tag in model_info.tags: - if tag in LANGUAGES: - tags[tag] = "true" - - if not tags: - tags["en"] = "true" - - return tags +SUPPORTED_TASKS = { + "fill-mask": FillMaskConverter(), + "question-answering": QuestionAnsweringConverter(), + "sentence-similarity": SentenceSimilarityConverter(), + "text-classification": TextClassificationConverter(), + "token-classification": TokenClassificationConverter(), +} class HuggingfaceModels: diff --git a/extensions/tokenizers/src/main/python/djl_converter/metadata.py b/extensions/tokenizers/src/main/python/djl_converter/metadata.py index 7e835eb94da..3b61e96677b 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/metadata.py +++ b/extensions/tokenizers/src/main/python/djl_converter/metadata.py @@ -11,7 +11,22 @@ # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for # the specific language governing permissions and limitations under the License. import json -from huggingface_models import get_lang_tags + +from huggingface_hub import HfApi + +LANGUAGES = HfApi().get_model_tags()["language"] + + +def get_lang_tags(model_info): + tags = {} + for tag in model_info.tags: + if tag in LANGUAGES: + tags[tag] = "true" + + if not tags: + tags["en"] = "true" + + return tags class HuggingfaceMetadata: diff --git a/extensions/tokenizers/src/main/python/djl_converter/model_zoo_importer.py b/extensions/tokenizers/src/main/python/djl_converter/model_zoo_importer.py index de7599a5a13..0538b23815f 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/model_zoo_importer.py +++ b/extensions/tokenizers/src/main/python/djl_converter/model_zoo_importer.py @@ -11,25 +11,13 @@ # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for # the specific language governing permissions and limitations under the License. import logging -import os.path +import os import shutil import sys -from arg_parser import converter_args -from fill_mask_converter import FillMaskConverter -from huggingface_models import HuggingfaceModels -from question_answering_converter import QuestionAnsweringConverter -from sentence_similarity_converter import SentenceSimilarityConverter -from text_classification_converter import TextClassificationConverter -from token_classification_converter import TokenClassificationConverter +sys.path.append(os.path.dirname(os.path.realpath(__file__))) -SUPPORTED_TASK = { - "fill-mask": FillMaskConverter(), - "question-answering": QuestionAnsweringConverter(), - "sentence-similarity": SentenceSimilarityConverter(), - "text-classification": TextClassificationConverter(), - "token-classification": TokenClassificationConverter(), -} +from djl_converter.arg_parser import converter_args def main(): @@ -38,6 +26,8 @@ def main(): level=logging.INFO) args = converter_args() + from djl_converter.huggingface_models import HuggingfaceModels, SUPPORTED_TASKS + huggingface_models = HuggingfaceModels(args.output_dir) temp_dir = f"{args.output_dir}/tmp" @@ -48,7 +38,7 @@ def main(): for model in models: task = model["task"] model_info = model["model_info"] - converter = SUPPORTED_TASK[task] + converter = SUPPORTED_TASKS[task] try: result, reason, size = converter.save_model( diff --git a/extensions/tokenizers/src/main/python/djl_converter/question_answering_converter.py b/extensions/tokenizers/src/main/python/djl_converter/question_answering_converter.py index 935c2118606..dd0f73920cf 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/question_answering_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/question_answering_converter.py @@ -14,7 +14,7 @@ import torch -from huggingface_converter import HuggingfaceConverter +from djl_converter.huggingface_converter import HuggingfaceConverter class QuestionAnsweringConverter(HuggingfaceConverter): diff --git a/extensions/tokenizers/src/main/python/djl_converter/sentence_similarity_converter.py b/extensions/tokenizers/src/main/python/djl_converter/sentence_similarity_converter.py index 7d28db57ea8..68d6e422aaf 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/sentence_similarity_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/sentence_similarity_converter.py @@ -19,7 +19,7 @@ import torch from transformers import AutoTokenizer, AutoModel, AutoConfig -from huggingface_converter import HuggingfaceConverter, PipelineHolder +from djl_converter.huggingface_converter import HuggingfaceConverter, PipelineHolder from huggingface_hub import hf_hub_download diff --git a/extensions/tokenizers/src/main/python/djl_converter/text_classification_converter.py b/extensions/tokenizers/src/main/python/djl_converter/text_classification_converter.py index 8b1dba4efa4..56552d1114e 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/text_classification_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/text_classification_converter.py @@ -15,7 +15,7 @@ import torch -from huggingface_converter import HuggingfaceConverter +from djl_converter.huggingface_converter import HuggingfaceConverter class TextClassificationConverter(HuggingfaceConverter): diff --git a/extensions/tokenizers/src/main/python/djl_converter/token_classification_converter.py b/extensions/tokenizers/src/main/python/djl_converter/token_classification_converter.py index fead8d247b7..33f910d3c7b 100644 --- a/extensions/tokenizers/src/main/python/djl_converter/token_classification_converter.py +++ b/extensions/tokenizers/src/main/python/djl_converter/token_classification_converter.py @@ -14,7 +14,7 @@ import torch -from huggingface_converter import HuggingfaceConverter +from djl_converter.huggingface_converter import HuggingfaceConverter class TokenClassificationConverter(HuggingfaceConverter):