diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer.h b/tensorflow_text/core/kernels/whitespace_tokenizer.h
index cc2d92b53..d249fda88 100644
--- a/tensorflow_text/core/kernels/whitespace_tokenizer.h
+++ b/tensorflow_text/core/kernels/whitespace_tokenizer.h
@@ -45,7 +45,8 @@ class WhitespaceTokenizerConfig {
       : config_(*config), max_codepoint_(config->length() * 8) {}
 
   inline bool IsWhitespace(const UChar32 codepoint) const {
-    return codepoint <= max_codepoint_ &&
+    return codepoint != U_SENTINEL &&
+           codepoint < max_codepoint_ &&
            config_[codepoint >> 3] & (1 << (char)(codepoint & 0x7));
   }
 
diff --git a/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc b/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc
index 84b6d5047..8f2519d54 100644
--- a/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc
+++ b/tensorflow_text/core/kernels/whitespace_tokenizer_test.cc
@@ -62,6 +62,18 @@ TEST(WhitespaceTokenizerTest, Internationalization) {
   EXPECT_THAT(output_end_offsets, ElementsAre(5, 10, 15));
 }
 
+TEST(WhitespaceTokenizerTest, InvalidCodepoint) {
+  absl::string_view input("\xE3");
+  std::vector<std::string> output_tokens;
+  std::vector<int> output_start_offsets;
+  std::vector<int> output_end_offsets;
+  std::string config = BuildWhitespaceTokenizerConfig();
+  WhitespaceTokenizer t(&config);
+  t.Tokenize(input, &output_tokens, &output_start_offsets, &output_end_offsets);
+  EXPECT_THAT(output_start_offsets, ElementsAre(0));
+  EXPECT_THAT(output_end_offsets, ElementsAre(1));
+}
+
 }  // namespace
 }  // namespace text
 }  // namespace tensorflow