From cfcfd3a2ff8673177cac2489b9d83fbcf2388a81 Mon Sep 17 00:00:00 2001
From: Utkarsh <49331882+uppalutkarsh@users.noreply.github.com>
Date: Tue, 9 Jul 2024 20:00:50 +0530
Subject: [PATCH 1/2] Add support for custom tokenizer

---
 examples/apps/fastapi_server.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py
index 972cb90d6..777e27a1e 100755
--- a/examples/apps/fastapi_server.py
+++ b/examples/apps/fastapi_server.py
@@ -79,11 +79,13 @@ async def __call__(self, host, port):
 
 @click.command()
 @click.argument("model_dir")
+@click.argument("tokenizer_path")
 @click.option("--host", type=str, default=None)
 @click.option("--port", type=int, default=8000)
 @click.option("--max_beam_width", type=int, default=1)
 @click.option("--tp_size", type=int, default=1)
 def entrypoint(model_dir: str,
+               tokenizer_path: str,
                host: Optional[str] = None,
                port: int = 8000,
                max_beam_width: int = 1,
@@ -95,6 +97,7 @@ def entrypoint(model_dir: str,
 
     build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width)
     llm = LLM(model_dir,
+              tokenizer_path,
               tensor_parallel_size=tp_size,
               build_config=build_config)
 

From 6e6a1fc5b76db0873ece474a25f7ac74ae546256 Mon Sep 17 00:00:00 2001
From: Utkarsh <49331882+uppalutkarsh@users.noreply.github.com>
Date: Tue, 9 Jul 2024 20:22:18 +0530
Subject: [PATCH 2/2] Add support for custom tokenizer and max_batch_size

---
 examples/apps/fastapi_server.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/apps/fastapi_server.py b/examples/apps/fastapi_server.py
index 777e27a1e..59d0226bf 100755
--- a/examples/apps/fastapi_server.py
+++ b/examples/apps/fastapi_server.py
@@ -84,17 +84,19 @@ async def __call__(self, host, port):
 @click.option("--port", type=int, default=8000)
 @click.option("--max_beam_width", type=int, default=1)
 @click.option("--tp_size", type=int, default=1)
+@click.option("--max_batch_size", type=int, default=10)
 def entrypoint(model_dir: str,
                tokenizer_path: str,
                host: Optional[str] = None,
                port: int = 8000,
                max_beam_width: int = 1,
-               tp_size: int = 1):
+               tp_size: int = 1,
+               max_batch_size: int = 10):
     host = host or "0.0.0.0"
     port = port or 8000
 
     logging.info(f"Starting server at {host}:{port}")
 
-    build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width)
+    build_config = BuildConfig(max_batch_size=max_batch_size, max_beam_width=max_beam_width)
     llm = LLM(model_dir,
               tokenizer_path,
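
Note: after both patches, the server takes the tokenizer path as a second required
positional argument and exposes the batch size as a flag. A hypothetical invocation
(both directory paths below are placeholders, not from the patch) might look like:

    python3 examples/apps/fastapi_server.py /path/to/engine_dir /path/to/tokenizer_dir \
        --host 0.0.0.0 --port 8000 --tp_size 1 --max_batch_size 16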