Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions src/Tokenizer/BaseTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public TokensWithOffsets TokenizeWithOffsets(string text)

foreach (var token in tokens)
{
texts.Add(token.Text);
texts.Add(new string(token.Text));
offsets.Add(token.ReferenceOffsets.Any() ? new Offset(token.ReferenceOffsets.First(), token.ReferenceOffsets.Last() + 1) : (Offset?)null);
originalPositions.Add(token.ReferenceOffsets);
masks.Add(token.Mask);
Expand Down Expand Up @@ -174,7 +174,7 @@ public virtual List<Token> TokenizeToTokens(Token initialToken)

return ownedToken;
})
.Where(token => !string.IsNullOrEmpty(token.Text))
.Where(token => !string.IsNullOrEmpty(new string(token.Text)))
.ToList();

return tokens;
Expand All @@ -183,7 +183,7 @@ public virtual List<Token> TokenizeToTokens(Token initialToken)
public void DecomposeNfkc(Token token)
{
// Perform NFKC normalization on the token text
var decomposedText = token.Text.Normalize(NormalizationForm.FormKC);
var decomposedText = new string(token.Text).Normalize(NormalizationForm.FormKC);

// Calculate the new reference offsets
var newReferenceOffsets = new List<uint>();
Expand All @@ -197,7 +197,7 @@ public void DecomposeNfkc(Token token)
}

// Update the token's properties
token.Text = decomposedText;
token.Text = decomposedText.ToCharArray();
token.ReferenceOffsets = newReferenceOffsets;
token.Offset.Begin = newReferenceOffsets.FirstOrDefault();
token.Offset.End = newReferenceOffsets.LastOrDefault() + 1;
Expand Down Expand Up @@ -357,11 +357,11 @@ private string ConvertTokensToString(List<string> tokens)
protected List<Token> SplitOnSpecialTokens(Token token, IVocab vocab)
{

Func<string, (int, int, Mask)> testSubstr = (s) =>
Func<char[], (int, int, Mask)> testSubstr = (s) =>
{
foreach (var specialValue in vocab.SpecialValues.Keys)
{
if (s.StartsWith(specialValue))
if (new string(s).StartsWith(specialValue))
{
return (
specialValue.Length,
Expand Down Expand Up @@ -400,7 +400,7 @@ private string CleanUpTokenization(string inputString)

private List<Token> WhitespaceTokenize(Token initialToken)
{
var parts = initialToken.Text.Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
var parts = new string(initialToken.Text).Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
var tokens = new List<Token>();
foreach (var part in parts)
{
Expand All @@ -410,7 +410,7 @@ private List<Token> WhitespaceTokenize(Token initialToken)
return tokens;
}

private List<Token> SplitOnSubstr(Token token, Func<string, (int, int, Mask)> testSubstr, bool addSeparators)
private List<Token> SplitOnSubstr(Token token, Func<char[], (int, int, Mask)> testSubstr, bool addSeparators)
{
var tokens = new List<Token>();
uint charBegin = 0;
Expand All @@ -420,7 +420,7 @@ private List<Token> SplitOnSubstr(Token token, Func<string, (int, int, Mask)> te
if (token.Mask == Mask.None)
{
// Iterate over characters with byte indices
var itr = TokenizationUtils.Enumerate(TokenizationUtils.CharIndicesForRunes(token.Text));
var itr = TokenizationUtils.Enumerate(TokenizationUtils.CharIndicesForRunes(new string(token.Text)));
foreach (var (charIdx, (bytesIdx, _)) in itr)
{
charCount++;
Expand All @@ -431,7 +431,7 @@ private List<Token> SplitOnSubstr(Token token, Func<string, (int, int, Mask)> te
if (charBegin < charIdx)
{
// Add previous token
var trimmedText = TokenizationUtils.SubstringRunes(token.Text, bytesBegin, bytesIdx - bytesBegin).TrimEnd();
var trimmedText = new string(TokenizationUtils.SubstringRunes(token.Text, bytesBegin, bytesIdx - bytesBegin)).TrimEnd();
if (trimmedText.EnumerateRunes().Count() > 0)
{
tokens.Add(new Token(trimmedText)
Expand Down Expand Up @@ -468,7 +468,7 @@ private List<Token> SplitOnSubstr(Token token, Func<string, (int, int, Mask)> te
var text = TokenizationUtils.SubstringRunes(token.Text, bytesBegin, bytesBegin + (bytesIdx - bytesBegin));
if (charCount == 0)
{
charCount = token.Text.EnumerateRunes().Count();
charCount = new string(token.Text).EnumerateRunes().Count();
}
tokens.Add(new Token(text)
{
Expand All @@ -493,7 +493,7 @@ private List<Token> SplitOnPunct(Token token)
if (char.IsPunctuation(charCurrent))
{
var offsets = token.ReferenceOffsets.Skip(start).Take(1).ToArray();
tokens.Add(new Token(text.Substring(start, 1), offsets) { Mask = Mask.Punctuation });
tokens.Add(new Token(new string(text).Substring(start, 1), offsets) { Mask = Mask.Punctuation });
start++;
}
else
Expand All @@ -504,7 +504,7 @@ private List<Token> SplitOnPunct(Token token)
end++;
}
var offsets = token.ReferenceOffsets.Skip(start).Take(end - start).ToArray();
tokens.Add(new Token(text.Substring(start, end - start), offsets));
tokens.Add(new Token(new string(text).Substring(start, end - start), offsets));
start = end;
}
}
Expand All @@ -523,7 +523,7 @@ private List<Token> TokenizeCjkChars(Token token)
if (IsCjkChar(charCurrent))
{
var offsets = token.ReferenceOffsets.Skip(start).Take(1).ToArray();
tokens.Add(new Token(text.Substring(start, 1), offsets) { Mask = Mask.CJK });
tokens.Add(new Token(new string(text).Substring(start, 1), offsets) { Mask = Mask.CJK });
start++;
}
else
Expand All @@ -534,7 +534,7 @@ private List<Token> TokenizeCjkChars(Token token)
end++;
}
var offsets = token.ReferenceOffsets.Skip(start).Take(end - start).ToArray();
tokens.Add(new Token(text.Substring(start, end - start), offsets));
tokens.Add(new Token(new string(text).Substring(start, end - start), offsets));
start = end;
}
}
Expand All @@ -551,19 +551,19 @@ private void CleanText(Token token, bool removeControlCharacters)
{
if (removeControlCharacters)
{
token.Text = Regex.Replace(token.Text, @"\p{C}+", "");
token.Text = Regex.Replace(new string(token.Text), @"\p{C}+", "").ToCharArray();
}
token.Text = token.Text.Replace("``", "\"").Replace("''", "\"");
token.Text = new string(token.Text).Replace("``", "\"").Replace("''", "\"").ToCharArray();
}

private void Lowercase(Token token)
{
token.Text = token.Text.ToLowerInvariant();
token.Text = new string(token.Text).ToLowerInvariant().ToCharArray();
}

private void StripAccents(Token token)
{
token.Text = RemoveDiacritics(token.Text);
token.Text = RemoveDiacritics(new string(token.Text)).ToCharArray();
}

private string RemoveDiacritics(string text)
Expand Down
16 changes: 8 additions & 8 deletions src/Tokenizer/Token.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ public class Token : IToken
/// <summary>
/// String representation
/// </summary>
public string Text { get; set; }
public char[] Text { get; set; }

/// <summary>
/// Start and end positions of the token with respect to the original text
Expand All @@ -30,9 +30,9 @@ public class Token : IToken
/// Creates a new owned token from a `String`.
/// </summary>
/// <param name="text">text reference</param>
public Token(string text)
public Token(ReadOnlySpan<char> text)
{
Text = text;
Text = text.ToArray();
var text_size = (uint)text.Length;
Offset = new Offset(0, text_size);
ReferenceOffsets = Enumerable.Range(0, (int)text_size).Select(i => (uint)i).ToList();
Expand All @@ -44,25 +44,25 @@ public Token(string text)
/// </summary>
/// <param name="text">text reference</param>
/// <param name="offsets">reference positions with respect to the original text</param>
public Token(string text, uint[] offsets)
public Token(ReadOnlySpan<char> text, uint[] offsets)
{
Text = text;
Text = text.ToArray();
Offset = new Offset(0, (uint)offsets.Length);
ReferenceOffsets = offsets;
Mask = Mask.None;
}

public Token(string text, Offset offset, IReadOnlyList<uint> referenceOffsets, Mask mask)
public Token(ReadOnlySpan<char> text, Offset offset, IReadOnlyList<uint> referenceOffsets, Mask mask)
{
Text = text;
Text = text.ToArray();
Offset = offset;
ReferenceOffsets = referenceOffsets;
Mask = mask;
}

public override string ToString()
{
return Text;
return new string(Text);
}

public static Token From(string text)
Expand Down
58 changes: 32 additions & 26 deletions src/Tokenizer/TokenizationUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,38 +11,31 @@ namespace Lokad.Tokenizers.Tokenizer;

internal static class TokenizationUtils
{

/// <summary>
/// Substring Runes (characters)
/// </summary>
public static string SubstringRunes(string text, int start, int length)
public static char[] SubstringRunes(ReadOnlySpan<char> text, int start, int length)
{
var sb = new StringBuilder();
text.EnumerateRunes().Skip(start).Take(length).ToList().ForEach(r => sb.Append(r));
return sb.ToString();
text.EnumerateRunes().ToList().Skip(start).Take(length).ToList().ForEach(r => sb.Append(r));
return sb.ToString().ToCharArray();
}

/// <summary>
/// Substring Runes (characters)
/// </summary>
public static string SubstringRunes(string text, int start)
public static char[] SubstringRunes(ReadOnlySpan<char> text, int start)
{
var sb = new StringBuilder();
text.EnumerateRunes().Skip(start).ToList().ForEach(r => sb.Append(r));
return sb.ToString();
}

/// <summary>
/// Get String Info
/// </summary>
public static StringInfo GetStringInfo(string text)
{
return new System.Globalization.StringInfo(text);
text.EnumerateRunes().ToList().Skip(start).ToList().ForEach(r => sb.Append(r));
return sb.ToString().ToCharArray();
}

/// <summary>
/// Get UTF 8 Bytes Count
/// </summary>
public static int GetUtf8BytesCount(string text)
public static int GetUtf8BytesCount(ReadOnlySpan<char> text)
{
return Encoding.UTF8.GetByteCount(text);
}
Expand All @@ -66,7 +59,7 @@ public static int GetUtf8BytesCount(string text)
/// NFKC decomposition
/// </summary>
public static IEnumerable<(Rune Character, int ExtraCharSize)> NFKC(string str)
{
{
var runes = str.EnumerateRunes().ToList();
for (var i = 0; i < runes.Count; i++)
{
Expand All @@ -89,18 +82,18 @@ public static int GetUtf8BytesCount(string text)
/// <summary>
/// Substring by byte offset
/// </summary>
public static string SubstringByByteOffset(string s, int start)
public static char[] SubstringByByteOffset(char[] s, int start)
{
var bytes = Encoding.UTF8.GetBytes(s);
var substringBytes = new byte[bytes.Length - start];
Array.Copy(bytes, start, substringBytes, 0, bytes.Length - start);
return Encoding.UTF8.GetString(substringBytes);
return Encoding.UTF8.GetChars(substringBytes);
}

/// <summary>
/// Substring by byte offset
/// </summary>
public static string SubstringByByteOffset(string s, int start, int end)
public static char[] SubstringByByteOffset(char[] s, int start, int end)
{
var bytes = Encoding.UTF8.GetBytes(s);
if (end > bytes.Length || start > end)
Expand All @@ -109,7 +102,7 @@ public static string SubstringByByteOffset(string s, int start, int end)
}
var substringBytes = new byte[end - start];
Array.Copy(bytes, start, substringBytes, 0, end - start);
return Encoding.UTF8.GetString(substringBytes);
return Encoding.UTF8.GetChars(substringBytes);
}

/// <summary>
Expand All @@ -120,7 +113,7 @@ public static void CleanText(Token token, bool strict)
var cleanedString = new StringBuilder(token.Text.Length);
var characterMapping = new List<uint>(token.Text.Length);

foreach (var (character, position) in token.Text.EnumerateRunes().Zip(token.ReferenceOffsets))
foreach (var (character, position) in token.Text.AsSpan().EnumerateRunes().ToList().Zip(token.ReferenceOffsets))
{
if (IsControl(character, strict) || character == new Rune('\x00') || character == new Rune('\uFFFD'))
{
Expand All @@ -131,7 +124,7 @@ public static void CleanText(Token token, bool strict)
characterMapping.Add(position);
}

token.Text = cleanedString.ToString();
token.Text = cleanedString.ToString().ToCharArray();
token.ReferenceOffsets = characterMapping;
token.Offset = new Offset(token.ReferenceOffsets.FirstOrDefault(), token.ReferenceOffsets.LastOrDefault() + 1);
}
Expand Down Expand Up @@ -191,7 +184,7 @@ public static void Lowercase(Token token)
}
}

token.Text = lowerCasedString.ToString();
token.Text = lowerCasedString.ToString().ToCharArray();
token.ReferenceOffsets = characterMapping;
token.Offset = new Offset(token.ReferenceOffsets.FirstOrDefault(), token.ReferenceOffsets.LastOrDefault() + 1);
}
Expand All @@ -205,13 +198,13 @@ public static void DecomposeNfkc(Token token)
var decomposedString = new StringBuilder(capacity);
var characterMapping = new List<uint>(capacity);
var curPosition = 0;
var normalizedString = token.Text.Normalize(NormalizationForm.FormKC);
var normalizedString = new string(token.Text).Normalize(NormalizationForm.FormKC);
foreach (var (character, currentExtraCharSize) in TokenizationUtils.NFKC(normalizedString))
{
var extraCharSize = 0;

//HINT: [@eslam] check if character is removed from the original text after normalization
if (!token.Text.EnumerateRunes().Contains(character))
if (!token.Text.AsSpan().EnumerateRunes().ToList().Contains(character))
extraCharSize -= currentExtraCharSize;

decomposedString.Append(character);
Expand All @@ -236,7 +229,7 @@ public static void DecomposeNfkc(Token token)
curPosition += 1; // Adjust based on Unicode character width if needed
}

token.Text = decomposedString.ToString();//.Normalize(NormalizationForm.FormKC);
token.Text = decomposedString.ToString().ToCharArray();//.Normalize(NormalizationForm.FormKC);
token.ReferenceOffsets = characterMapping;
token.Offset.Begin = token.ReferenceOffsets.FirstOrDefault();
token.Offset.End = token.ReferenceOffsets.LastOrDefault() + 1;
Expand Down Expand Up @@ -475,4 +468,17 @@ public static (TokenIdsWithOffsets, TokenIdsWithOffsets?, List<long>, List<Offse
}
}

/// <summary>
/// Materializes the runes yielded by a <see cref="SpanRuneEnumerator"/> into a
/// <see cref="List{T}"/> of <see cref="Rune"/>, since the ref-struct enumerator
/// cannot be consumed by LINQ directly.
/// </summary>
/// <param name="enumerator">The span rune enumerator to drain.</param>
/// <returns>A list containing every rune produced by the enumerator, in order.</returns>
public static List<Rune> ToList(this SpanRuneEnumerator enumerator)
{
    // SpanRuneEnumerator's GetEnumerator() returns itself, so driving
    // MoveNext/Current directly is equivalent to a foreach over it.
    var result = new List<Rune>();
    while (enumerator.MoveNext())
    {
        result.Add(enumerator.Current);
    }
    return result;
}

}
8 changes: 4 additions & 4 deletions src/Tokenizer/XLMRobertaTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,15 @@ public override List<Token> TokenizeToTokens(Token tokenRef)

// Manually replacing whitespace characters
var newText = new StringBuilder();
foreach (var c in token.Text.EnumerateRunes())
foreach (var c in token.Text.AsSpan().EnumerateRunes().ToList())
{
newText.Append(TokenizationUtils.IsWhitespace(c) ? new Rune(Constants.LowerOneEighthBlock) : c.ToString());
}
token.Text = newText.ToString();
token.Text = newText.ToString().ToCharArray();

if (!token.Text.StartsWith(Constants.LowerOneEighthBlock))
if (!new string(token.Text).StartsWith(Constants.LowerOneEighthBlock))
{
token.Text = Constants.LowerOneEighthBlock + token.Text;
token.Text = (Constants.LowerOneEighthBlock.ToString() + new string(token.Text)).ToCharArray();
var newReferenceOffsets = new List<uint> { 0 };
newReferenceOffsets.AddRange(token.ReferenceOffsets);
token.ReferenceOffsets = newReferenceOffsets;
Expand Down
Loading