diff --git a/tests/mask_store/test_byte_tokenizer.py b/tests/mask_store/test_byte_tokenizer.py
index e5fe11d3..b456e9fa 100644
--- a/tests/mask_store/test_byte_tokenizer.py
+++ b/tests/mask_store/test_byte_tokenizer.py
@@ -65,13 +65,13 @@ def test_raw_tokenizer(self):
"""Test ByteTokenizer with a RAW (tiktoken-style) tokenizer."""
# Create mock vocabulary for a raw tokenizer
vocab = {
- "hello": 1,
- "world": 2,
- "!": 3,
- "你": 4,
- "好": 5,
- "吗": 6,
- "?": 7
+ b"hello": 1,
+ b"world": 2,
+ b"!": 3,
+ b"\xE4\xBD\xA0": 4, # 你
+ b"\xE5\xA5\xBD": 5, # 好
+ b"\xE5\x90": 6, # first two bytes of 吗
+ b"\x97": 7, # last byte of 吗
}
mock_tokenizer = self.create_mock_tokenizer(vocab, VocabType.RAW)
@@ -84,10 +84,10 @@ def test_raw_tokenizer(self):
mock_tokenizer.encode.return_value = expected_ids
# Test decoding
- token_ids = [4, 5, 6, 7] # 你, 好, 吗, ?
+ token_ids = [4, 5, 6] # 你, 好, first two bytes of 吗
mock_tokenizer.decode.return_value = "你好吗?"
result = byte_tokenizer.decode(token_ids)
- self.assertEqual(result.decode('utf-8'), "你好吗?")
+ self.assertEqual(result, b"\xE4\xBD\xA0\xE5\xA5\xBD\xE5\x90")
def test_byte_fallback_tokenizer(self):
"""Test ByteTokenizer with a BYTE_FALLBACK (Llama-2-style) tokenizer."""
@@ -151,11 +151,11 @@ def test_byte_level_tokenizer(self):
def test_batched_decoding(self):
"""Test batched decoding capabilities."""
vocab = {
- "hello": 1,
- "world": 2,
- "!": 3,
- "": 4, # special token
- "": 5, # special token
+ b"hello": 1,
+ b"world": 2,
+ b"!": 3,
+ b"": 4, # special token
+ b"": 5, # special token
}
mock_tokenizer = self.create_mock_tokenizer(vocab, VocabType.RAW)
@@ -196,10 +196,10 @@ def test_auto_detection(self):
def test_decoding_performance(self):
"""Test basic decoding performance."""
# Create a larger vocabulary for more realistic testing
- vocab = {f"token{i}": i for i in range(1000)}
+ vocab = {f"token{i}".encode('utf-8'): i for i in range(1000)}
# Add some special tokens
- vocab[""] = 1000
- vocab[""] = 1001
+ vocab[b""] = 1000
+ vocab[b""] = 1001
mock_tokenizer = self.create_mock_tokenizer(vocab, VocabType.RAW)
mock_tokenizer.all_special_ids = [1000, 1001]
@@ -313,10 +313,10 @@ def test_roundtrip_encoding_decoding(self):
"""Test encoding and decoding round-trip."""
# Create a simple vocabulary for testing
raw_vocab = {
- "hello": 1,
- " ": 2,
- "world": 3,
- "!": 4,
+ b"hello": 1,
+ b" ": 2,
+ b"world": 3,
+ b"!": 4,
}
mock_tokenizer = self.create_mock_tokenizer(raw_vocab, VocabType.RAW)
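
For reviewers, a minimal sketch of the byte-concatenation behavior these assertions assume. The `decode_to_bytes` helper is hypothetical, not the repo's `ByteTokenizer`; it only illustrates why splitting 吗 across two ids yields bytes that are not yet valid UTF-8:

```python
# Hypothetical stand-in for the RAW decode path: map each token id to
# its raw bytes and concatenate, with no UTF-8 validation in between.
ID_TO_BYTES = {
    4: b"\xE4\xBD\xA0",  # 你
    5: b"\xE5\xA5\xBD",  # 好
    6: b"\xE5\x90",      # first two bytes of 吗
    7: b"\x97",          # last byte of 吗
}

def decode_to_bytes(token_ids):
    return b"".join(ID_TO_BYTES[i] for i in token_ids)

# [4, 5, 6] stops mid-character, so the result is not yet valid UTF-8:
assert decode_to_bytes([4, 5, 6]) == b"\xE4\xBD\xA0\xE5\xA5\xBD\xE5\x90"
# Once token 7 arrives, the accumulated bytes decode cleanly:
assert decode_to_bytes([4, 5, 6, 7]).decode('utf-8') == "你好吗"
```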