From 2073d0582ded653f19e84683b33fb8ab89de85eb Mon Sep 17 00:00:00 2001 From: ajaykg Date: Mon, 6 May 2024 06:11:01 +0530 Subject: [PATCH] Update regex.rs to not split words on combining marks and diacritics Over half the world's population speaks languages that use Unicode combining marks, such as accents and matras, within words. The GPT / tiktoken regular expressions split such words apart, preventing merges of characters that should actually merge. Edited the regular expression so it does not split on such combining characters. --- src/regex.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/regex.rs b/src/regex.rs index f2772fc..3fdf55c 100644 --- a/src/regex.rs +++ b/src/regex.rs @@ -12,6 +12,8 @@ pub const GPT2_SPLIT_PATTERN: &str = pub const GPT4_SPLIT_PATTERN: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"; +pub const UNICODE_SPLIT_PATTERN: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+[\p{L}\p{M}]+|\p{N}{1,3}| ?[^\s\p{L}\p{M}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"; + /// Specifies how to handle special tokens during encoding. /// /// This enum is used to control the behavior of the `encode_special` function @@ -272,7 +274,7 @@ pub struct RegexTokenizerStruct { impl Default for RegexTokenizerStruct { fn default() -> Self { - Self::new(GPT4_SPLIT_PATTERN.to_string()) + Self::new(UNICODE_SPLIT_PATTERN.to_string()) } }