diff --git a/src/regex.rs b/src/regex.rs
index f2772fc..3fdf55c 100644
--- a/src/regex.rs
+++ b/src/regex.rs
@@ -12,6 +12,15 @@ pub const GPT2_SPLIT_PATTERN: &str =
 pub const GPT4_SPLIT_PATTERN: &str =
     r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
 
+/// Split pattern that extends [`GPT4_SPLIT_PATTERN`] by treating combining
+/// marks (`\p{M}`) as part of the word they are attached to.
+///
+/// `\p{M}` is excluded from the optional one-character word prefix: that prefix
+/// is possessive, so if it consumed a mark that is not followed by a letter or
+/// another mark, no alternative could match the mark and it would be skipped.
+pub const UNICODE_SPLIT_PATTERN: &str =
+    r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{M}\p{N}]?+[\p{L}\p{M}]+|\p{N}{1,3}| ?[^\s\p{L}\p{M}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
+
 /// Specifies how to handle special tokens during encoding.
 ///
 /// This enum is used to control the behavior of the `encode_special` function
@@ -272,6 +281,7 @@ pub struct RegexTokenizerStruct {
 
 impl Default for RegexTokenizerStruct {
+    /// Constructs the tokenizer via `Self::new` with [`UNICODE_SPLIT_PATTERN`].
     fn default() -> Self {
-        Self::new(GPT4_SPLIT_PATTERN.to_string())
+        Self::new(UNICODE_SPLIT_PATTERN.to_string())
     }
 }