diff --git a/curated_transformers/tokenizers/legacy/bbpe_tokenizer.py b/curated_transformers/tokenizers/legacy/bbpe_tokenizer.py
index 79a842b3..b60ca437 100644
--- a/curated_transformers/tokenizers/legacy/bbpe_tokenizer.py
+++ b/curated_transformers/tokenizers/legacy/bbpe_tokenizer.py
@@ -64,7 +64,7 @@ def _encode(self, input: Iterable[MergedInputChunks]) -> PiecesWithIds:
             for chunk in seq:
                 if isinstance(chunk, MergedSpecialPieceChunk):
-                    piece_id = self.processor.piece_id(chunk.piece)
+                    piece_id = self.processor.piece_to_id(chunk.piece)
                     if piece_id is None:
                         raise ValueError(f"Unknown special piece: {chunk.piece}")
                     seq_ids.append(piece_id)
                     seq_pieces.append(chunk.piece)
diff --git a/curated_transformers/tokenizers/legacy/roberta_tokenizer.py b/curated_transformers/tokenizers/legacy/roberta_tokenizer.py
index 5f0b328e..dcc3441a 100644
--- a/curated_transformers/tokenizers/legacy/roberta_tokenizer.py
+++ b/curated_transformers/tokenizers/legacy/roberta_tokenizer.py
@@ -131,7 +131,7 @@ def _load_from_vocab_files(


 def _get_piece_id_or_fail(processor: ByteBPEProcessor, piece: str):
-    piece_id = processor.piece_id(piece)
+    piece_id = processor.piece_to_id(piece)
     if piece_id is None:
         raise ValueError(
             f"RoBERTa piece encoder vocabulary doesn't contain '{piece}' piece"
diff --git a/curated_transformers/tokenizers/legacy/sentencepiece_tokenizer.py b/curated_transformers/tokenizers/legacy/sentencepiece_tokenizer.py
index 7053143f..279c03a5 100644
--- a/curated_transformers/tokenizers/legacy/sentencepiece_tokenizer.py
+++ b/curated_transformers/tokenizers/legacy/sentencepiece_tokenizer.py
@@ -52,12 +52,8 @@ def _encode(self, input: Iterable[MergedInputChunks]) -> PiecesWithIds:
             for chunk in seq:
                 if isinstance(chunk, MergedSpecialPieceChunk):
-                    # TODO: this is not ideal. piece_id() should probably return
-                    # None for unknown pieces.
-                    unk_id = self.processor.unk_id()
-                    unk_piece = self.processor.id_to_piece(unk_id)
                     piece_id = self.processor.piece_to_id(chunk.piece)
-                    if piece_id == unk_id and chunk.piece != unk_piece:
+                    if piece_id is None:
                         raise ValueError(f"Unknown special piece: {chunk.piece}")
                     seq_ids.append(piece_id)
                     seq_pieces.append(chunk.piece)
                 else:
diff --git a/requirements.txt b/requirements.txt
index ccb00791..458cbf3e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-curated-tokenizers>=0.0.7,<0.1.0
+curated-tokenizers>=0.9.0.dev0,<1.0.0
 huggingface-hub>=0.14
 tokenizers>=0.13.3
 torch>=1.12.0
diff --git a/setup.cfg b/setup.cfg
index 2854e618..4d352212 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -14,7 +14,7 @@ zip_safe = true
 include_package_data = true
 python_requires = >=3.8
 install_requires =
-    curated-tokenizers>=0.0.7,<0.1.0
+    curated-tokenizers>=0.9.0.dev0,<1.0.0
     huggingface-hub>=0.14
     tokenizers>=0.13.3
     torch>=1.12.0
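
For reference, a minimal sketch of the lookup pattern these changes rely on. It only assumes the contract exercised in the diff above, namely that `piece_to_id()` in curated-tokenizers >= 0.9.0.dev0 returns `None` for pieces missing from the vocabulary instead of the unknown-piece id; the processor construction itself is elided, and the helper name is illustrative.

```python
from curated_tokenizers import ByteBPEProcessor


def get_piece_id_or_fail(processor: ByteBPEProcessor, piece: str) -> int:
    # Assumed behaviour of curated-tokenizers >= 0.9.0.dev0: piece_to_id()
    # returns None when the piece is not in the vocabulary, so unknown
    # special pieces can be detected without comparing against the unk id
    # (the workaround removed from sentencepiece_tokenizer.py above).
    piece_id = processor.piece_to_id(piece)
    if piece_id is None:
        raise ValueError(f"Unknown special piece: {piece}")
    return piece_id
```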