Skip to content

Commit

Permalink
[tokenizer] Adds getters for HuggingFaceTokenizer (#2958)
Browse files Browse the repository at this point in the history
  • Loading branch information
frankfliu committed Apr 26, 2024
1 parent 33c2d60 commit f94883d
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,53 @@ public void enableBatch() {
}
}

/**
 * Gets the name of the truncation policy currently held by this tokenizer.
 *
 * <p>The value is whatever {@code truncation.name()} reports; for example, the accompanying
 * test observes {@code "DO_NOT_TRUNCATE"} on a freshly created tokenizer.
 *
 * @return the truncation policy name
 */
public String getTruncation() {
    final String policyName = truncation.name();
    return policyName;
}

/**
 * Gets the name of the padding policy currently held by this tokenizer.
 *
 * <p>The value is whatever {@code padding.name()} reports; for example, the accompanying test
 * observes {@code "DO_NOT_PAD"} on a freshly created tokenizer.
 *
 * @return the padding policy name
 */
public String getPadding() {
    final String policyName = padding.name();
    return policyName;
}

/**
 * Gets the maximum token length configured on this tokenizer.
 *
 * @return the value of the {@code maxLength} field
 */
public int getMaxLength() {
    final int currentMaxLength = maxLength;
    return currentMaxLength;
}

/**
 * Gets the stride, i.e. the overflow overlap applied when sequences longer than the model
 * supports are truncated.
 *
 * @return the value of the {@code stride} field
 */
public int getStride() {
    final int currentStride = stride;
    return currentStride;
}

/**
 * Gets the {@code padToMultipleOf} setting used for padding.
 *
 * @return the value of the {@code padToMultipleOf} field
 */
public int getPadToMultipleOf() {
    final int multiple = padToMultipleOf;
    return multiple;
}

/**
* Creates a builder to build a {@code HuggingFaceTokenizer}.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ public void testTokenizer() throws IOException {
};

try (HuggingFaceTokenizer tokenizer = HuggingFaceTokenizer.newInstance("bert-base-cased")) {
Assert.assertEquals(tokenizer.getTruncation(), "DO_NOT_TRUNCATE");
Assert.assertEquals(tokenizer.getPadding(), "DO_NOT_PAD");
Assert.assertEquals(tokenizer.getMaxLength(), -1);
Assert.assertEquals(tokenizer.getStride(), 0);
Assert.assertEquals(tokenizer.getPadToMultipleOf(), 0);

List<String> ret = tokenizer.tokenize(input);
Assert.assertEquals(ret.toArray(Utils.EMPTY_ARRAY), expected);
Encoding encoding = tokenizer.encode(input);
Expand Down

0 comments on commit f94883d

Please sign in to comment.