From a40f05d60bade7bfc6637019f8ea92b5b62fdc37 Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Mon, 26 Jan 2026 14:07:12 +0000 Subject: [PATCH 1/4] Add regex tokenizer Revert accidental formatting changes Revert accidental formatting changes in exact module Final fixes l --- libpz/include/RegexTokenizer.hpp | 144 ++++++++++ libpz/include/pz_cxx_std.hpp | 1 + libpz/regex/RegexTokenizer.cpp | 437 +++++++++++++++++++++++++++++++ 3 files changed, 582 insertions(+) create mode 100644 libpz/include/RegexTokenizer.hpp create mode 100644 libpz/regex/RegexTokenizer.cpp diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp new file mode 100644 index 0000000..1b87e78 --- /dev/null +++ b/libpz/include/RegexTokenizer.hpp @@ -0,0 +1,144 @@ +#ifndef REGEX_TOKENIZER_HPP +#define REGEX_TOKENIZER_HPP + +#include + +/** + * @brief Types of tokens produced by the regex tokenizer. + */ +enum class TokenType { + /** Literal character like 'a', 'b', etc. */ + LITERAL, + + /** '.' wildcard */ + DOT, + + /** '*' operator */ + STAR, + + /** '+' operator */ + PLUS, + + /** '?' operator */ + QUESTION, + + /** '|' alternation */ + ALTERNATION, + + /** '(' opening group */ + LPAREN, + + /** ')' closing group */ + RPAREN, + + /** '^' start anchor */ + CARET, + + /** '$' end anchor */ + DOLLAR, + + /** Character class: '[...]', \d, \w, \s, etc */ + CHAR_CLASS, + + /** Quantifier range: '{m,n}', '{m,}', '{m}' */ + QUANTIFIER_RANGE, + + /** End of pattern */ + END, + + /** Implicit concatenation */ + CONCAT +}; + +/** + * @brief Represents a character range [lo, hi]. + */ +struct CharRange { + /** Lower bound */ + char lo; + + /** Upper bound */ + char hi; +}; + +/** + * @brief A single token in the regex. 
+ */ +struct Token { + /** Token category */ + TokenType type; + /** Position in pattern (for error reporting) */ + size_t pos; + /** Group ID for parentheses */ + int group_id = -1; + + /** Literal character value */ + char literal = '\0'; + + /** Whether character class is negated */ + bool negated = false; + /** Character ranges for character class */ + std::vector ranges{}; + + /** Minimum repetitions for quantifier */ + int min = 0; + /** Maximum repetitions (-1 means unbounded) */ + int max = 0; +}; + +/** + * @brief Converts a regex pattern into a sequence of tokens. + */ +class Tokenizer { +public: + /** + * @brief Construct tokenizer for a pattern. + * @param pat Regex pattern. + */ + explicit Tokenizer(std::string_view pat); + + /** + * @brief Tokenize the entire pattern. + * @return Vector of tokens ending with END token. + */ + std::vector tokenize(); + +private: + /** Input regex pattern */ + std::string_view pattern; + /** Current cursor position */ + size_t i = 0; + /** Counter for assigning group IDs */ + int group_counter = 0; + /** Stack for nested group tracking */ + std::stack group_stack; + + /** Peek next character without consuming */ + char peek() const; + /** Consume next character */ + char get(); + /** Check for end of input */ + bool eof() const; + + /** Read next token */ + Token next_token(); + /** Read literal character */ + Token read_literal(char); + /** Read escape sequence */ + Token read_escape(); + /** Read character class */ + Token read_char_class(); + /** Read quantifier range */ + Token read_quantifier(); + + /** @brief Populates a token with ranges for \d, \w, \s, etc. */ + void add_shorthand_ranges(char, Token &); + + /** @brief Inserts implicit CONCAT tokens where concatenation occurs. */ + void add_concat_tokens(std::vector &); + + /** @brief Sorts and merges overlapping ranges for efficient NFA matching. 
*/ + void normalize_ranges(std::vector &); +}; + +#endif // REGEX_TOKENIZER_HPP \ No newline at end of file diff --git a/libpz/include/pz_cxx_std.hpp b/libpz/include/pz_cxx_std.hpp index f4c0160..4b38066 100644 --- a/libpz/include/pz_cxx_std.hpp +++ b/libpz/include/pz_cxx_std.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/libpz/regex/RegexTokenizer.cpp b/libpz/regex/RegexTokenizer.cpp new file mode 100644 index 0000000..cc12b17 --- /dev/null +++ b/libpz/regex/RegexTokenizer.cpp @@ -0,0 +1,437 @@ +#include "RegexTokenizer.hpp" +#include "pz_error.hpp" + +Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {} + +char Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; } + +char Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } + +bool Tokenizer::eof() const { return i >= pattern.size(); } + +std::vector Tokenizer::tokenize() { + std::vector tokens; + while (!eof()) { + tokens.push_back(next_token()); + } + tokens.push_back(Token{TokenType::END, i}); + add_concat_tokens(tokens); + return tokens; +} + +void Tokenizer::add_concat_tokens(std::vector &tokens) { + if (tokens.size() <= 2) + return; + + std::vector normalized; + normalized.reserve(tokens.size() * 2); + + for (size_t idx = 0; idx < tokens.size(); idx++) { + normalized.push_back(tokens[idx]); + + if (idx + 1 >= tokens.size()) + break; + + const Token ¤t = tokens[idx]; + const Token &next = tokens[idx + 1]; + + // Can the current token be the left side of a concatenation? 
+ bool is_ender = + (current.type == TokenType::LITERAL || current.type == TokenType::DOT || + current.type == TokenType::CHAR_CLASS || + current.type == TokenType::RPAREN || current.type == TokenType::STAR || + current.type == TokenType::PLUS || + current.type == TokenType::QUESTION || + current.type == TokenType::QUANTIFIER_RANGE || + current.type == TokenType::CARET); + + bool is_starter = + (next.type == TokenType::LITERAL || next.type == TokenType::DOT || + next.type == TokenType::LPAREN || next.type == TokenType::CHAR_CLASS || + next.type == TokenType::DOLLAR); + + if (is_ender && is_starter) { + Token concat; + concat.type = TokenType::CONCAT; + concat.pos = current.pos; + normalized.push_back(concat); + } + } + + tokens = std::move(normalized); +} + +Token Tokenizer::next_token() { + char c = get(); + + // Position of the character that produced this token + size_t pos = i - 1; + + switch (c) { + case '.': + return {TokenType::DOT, pos}; + case '*': + return {TokenType::STAR, pos}; + case '+': + return {TokenType::PLUS, pos}; + case '?': + return {TokenType::QUESTION, pos}; + case '|': + return {TokenType::ALTERNATION, pos}; + case '(': { + int id = ++group_counter; + group_stack.push(id); + Token t{TokenType::LPAREN, pos}; + t.group_id = id; + return t; + } + case ')': { + if (group_stack.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Mismatched ')' at position " + + std::to_string(pos)); + int id = group_stack.top(); + group_stack.pop(); + Token t{TokenType::RPAREN, pos}; + t.group_id = id; + return t; + } + case '^': + return {TokenType::CARET, pos}; + case '$': + return {TokenType::DOLLAR, pos}; + case '\\': + return read_escape(); + case '[': + return read_char_class(); + case '{': + return read_quantifier(); + default: + return read_literal(c); + } +} + +Token Tokenizer::read_literal(char c) { + Token t{TokenType::LITERAL, i - 1}; + t.literal = c; + return t; +} + +Token Tokenizer::read_escape() { + if (eof()) + 
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape at end of input"); + + Token t; + t.pos = i - 1; + char c = get(); + + if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { + t.type = TokenType::CHAR_CLASS; + add_shorthand_ranges(c, t); + return t; + } + + t.type = TokenType::LITERAL; + switch (c) { + case 'n': + t.literal = '\n'; + break; + case 't': + t.literal = '\t'; + break; + case 'r': + t.literal = '\r'; + break; + case 'f': + t.literal = '\f'; + break; + case 'v': + t.literal = '\v'; + break; + default: + t.literal = c; + break; + } + return t; +} + +void Tokenizer::add_shorthand_ranges(char c, Token &t) { + const char MIN_CHAR = '\0'; // ascii index 0 + const char MAX_CHAR = '\x7F'; // ascii index 127 + switch (c) { + case 'd': + t.ranges.push_back({'0', '9'}); + break; + case 'D': + t.ranges.insert(t.ranges.end(), + { + {MIN_CHAR, '/'}, // Everything before '0' + {':', MAX_CHAR} // Everything after '9' + }); + break; + case 'w': + t.ranges.insert(t.ranges.end(), + {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}, {'_', '_'}}); + break; + case 'W': + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, '/'}, // Before '0' + {':', '@'}, // Between '9' and 'A' + {'[', '^'}, // Between 'Z' and '_' + {'`', '`'}, // Between '_' and 'a' + {'{', MAX_CHAR} // After 'z' + }); + break; + case 's': + t.ranges.insert(t.ranges.end(), {{' ', ' '}, + {'\t', '\t'}, + {'\n', '\n'}, + {'\r', '\r'}, + {'\f', '\f'}, + {'\v', '\v'}}); + break; + + case 'S': + t.ranges.insert(t.ranges.end(), + { + {MIN_CHAR, '\x08'}, // Before \t (0-8) + {'\x0E', '\x1F'}, // Between \r and Space (14-31) + {'!', MAX_CHAR} // After Space (33-127) + }); + break; + } +} + +void Tokenizer::normalize_ranges(std::vector &ranges) { + if (ranges.empty()) + return; + + std::sort(ranges.begin(), ranges.end(), + [](const CharRange &a, const CharRange &b) { + if (a.lo != b.lo) + return a.lo < b.lo; + return a.hi < b.hi; + }); + + size_t write = 0; + + for (size_t read = 
1; read < ranges.size(); ++read) { + CharRange &last = ranges[write]; + const CharRange &cur = ranges[read]; + + if (cur.lo <= last.hi + 1) { + // merge into last + last.hi = std::max(last.hi, cur.hi); + } else { + // move cur to next write position + ++write; + ranges[write] = cur; + } + } + + ranges.resize(write + 1); +} + +Token Tokenizer::read_char_class() { + Token t{TokenType::CHAR_CLASS, i - 1}; + if (peek() == '^') { + t.negated = true; + get(); + } + + bool have_prev = false; // pending character for range + bool last_was_shorthand = false; // whether last token was \d, \w, etc. + char prev; + + // Read until closing ']' + while (!eof() && peek() != ']') { + char c = get(); + if (c == '\\') // Handle escape sequences + { + if (eof()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape in char class at position " + + std::to_string(i)); + // Flush pending literal before escape + if (have_prev) { + t.ranges.push_back({prev, prev}); + have_prev = false; + } + c = get(); + switch (c) { + // Common escaped control characters + case 'n': + prev = '\n'; + have_prev = true; + last_was_shorthand = false; + break; + case 't': + prev = '\t'; + have_prev = true; + last_was_shorthand = false; + break; + case 'r': + prev = '\r'; + have_prev = true; + last_was_shorthand = false; + break; + case 'f': + prev = '\f'; + have_prev = true; + last_was_shorthand = false; + break; + case 'v': + prev = '\v'; + have_prev = true; + last_was_shorthand = false; + break; + + // Shorthand character classes + case 'd': + case 'w': + case 's': + case 'D': + case 'W': + case 'S': { + add_shorthand_ranges(c, t); + last_was_shorthand = true; + break; + } + + // Escaped literal characters + default: { + prev = c; + have_prev = true; + last_was_shorthand = false; + break; + } + } + continue; + } + + // Handle range syntax: + if (have_prev && c == '-' && + peek() != ']') { // when '-' acts as a range specifier + char ub = get(); + if (ub == '\\') // Handle escaped 
upper bound + { + if (eof()) + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape in character range at position " + + std::to_string(i)); + ub = get(); + if (ub == 'd' || ub == 'D' || ub == 'w' || ub == 'W' || ub == 's' || + ub == 'S') { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Cannot create a range with shorthand escape " + "sequences at position " + + std::to_string(i - 1)); + } + } + if (prev > ub) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid character range at position " + + std::to_string(i - 1)); + t.ranges.push_back({prev, ub}); + have_prev = false; + continue; + } + if (c == '-' && last_was_shorthand && peek() != ']') { + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Cannot create a range with shorthand escape sequences at position " + + std::to_string(i - 1)); + } + + // Flush pending literal if no range follows + if (have_prev) + t.ranges.push_back({prev, prev}); + + prev = c; + have_prev = true; + last_was_shorthand = false; + } + + // Missing closing ']' + if (eof()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Unterminated character class starting at position " + + std::to_string(t.pos)); + if (have_prev) + t.ranges.push_back({prev, prev}); // Flush last pending character + if (t.ranges.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Empty char class starting at position " + + std::to_string(t.pos)); // Disallow empty classes + get(); // consume ']' + normalize_ranges(t.ranges); + return t; +} +// NOTE: []] will be treated as an empty character class followed by a ] literal +// In many regex implementations, it gets processed as a valid char class with +// literal ']' but we currently treat the earliest found ] as the end of the +// char class as a design choice. To use ] as a literal inside the char class, +// user needs to escape it. 
+ +Token Tokenizer::read_quantifier() { + // Position of '{' is stored in t.pos for error reporting + Token t{TokenType::QUANTIFIER_RANGE, i - 1}; + + auto skip_spaces = [&]() { + while (!eof() && std::isspace(peek())) { + get(); + } + }; + + auto read_int = [&]() -> int { + skip_spaces(); + int val = 0; + bool found = false; + while (!eof() && std::isdigit(peek())) { + found = true; + val = val * 10 + (get() - '0'); + } + if (!found) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Expected number in quantifier at position " + + std::to_string(t.pos)); + skip_spaces(); + return val; + }; + + t.min = read_int(); + + if (peek() == '}') { + get(); + t.max = t.min; + return t; + } + + if (peek() != ',') + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier syntax at position " + + std::to_string(t.pos)); + get(); + skip_spaces(); + + if (peek() == '}') { + get(); + t.max = -1; + return t; + } + + t.max = read_int(); + if (peek() != '}') + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier syntax at position " + + std::to_string(t.pos)); + get(); + + if (t.max != -1 && t.max < t.min) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier range at position " + + std::to_string(t.pos)); + return t; +} \ No newline at end of file From c1d2b486e83be8a923c10be9b218bdc1f7da9abf Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Tue, 27 Jan 2026 13:35:38 +0000 Subject: [PATCH 2/4] handle {,num} and use pz_types --- libpz/include/RegexTokenizer.hpp | 25 +++++----- libpz/regex/RegexTokenizer.cpp | 78 +++++++++++++++----------------- 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp index 1b87e78..5cd2173 100644 --- a/libpz/include/RegexTokenizer.hpp +++ b/libpz/include/RegexTokenizer.hpp @@ -2,6 +2,7 @@ #define REGEX_TOKENIZER_HPP #include +#include /** * @brief Types of tokens produced 
by the regex tokenizer. @@ -55,10 +56,10 @@ enum class TokenType { */ struct CharRange { /** Lower bound */ - char lo; + ut8 lo; /** Upper bound */ - char hi; + ut8 hi; }; /** @@ -70,10 +71,10 @@ struct Token { /** Position in pattern (for error reporting) */ size_t pos; /** Group ID for parentheses */ - int group_id = -1; + st32 group_id = -1; /** Literal character value */ - char literal = '\0'; + ut8 literal = '\0'; /** Whether character class is negated */ bool negated = false; @@ -81,9 +82,9 @@ struct Token { std::vector ranges{}; /** Minimum repetitions for quantifier */ - int min = 0; + st32 min = 0; /** Maximum repetitions (-1 means unbounded) */ - int max = 0; + st32 max = 0; }; /** @@ -109,21 +110,21 @@ class Tokenizer { /** Current cursor position */ size_t i = 0; /** Counter for assigning group IDs */ - int group_counter = 0; + st32 group_counter = 0; /** Stack for nested group tracking */ - std::stack group_stack; + std::stack group_stack; /** Peek next character without consuming */ - char peek() const; + ut8 peek() const; /** Consume next character */ - char get(); + ut8 get(); /** Check for end of input */ bool eof() const; /** Read next token */ Token next_token(); /** Read literal character */ - Token read_literal(char); + Token read_literal(ut8); /** Read escape sequence */ Token read_escape(); /** Read character class */ @@ -132,7 +133,7 @@ class Tokenizer { Token read_quantifier(); /** @brief Populates a token with ranges for \d, \w, \s, etc. */ - void add_shorthand_ranges(char, Token &); + void add_shorthand_ranges(ut8, Token &); /** @brief Inserts implicit CONCAT tokens where concatenation occurs. */ void add_concat_tokens(std::vector &); diff --git a/libpz/regex/RegexTokenizer.cpp b/libpz/regex/RegexTokenizer.cpp index cc12b17..afd3203 100644 --- a/libpz/regex/RegexTokenizer.cpp +++ b/libpz/regex/RegexTokenizer.cpp @@ -3,9 +3,9 @@ Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {} -char Tokenizer::peek() const { return eof() ? 
'\0' : pattern[i]; } +ut8 Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; } -char Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } +ut8 Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } bool Tokenizer::eof() const { return i >= pattern.size(); } @@ -62,7 +62,7 @@ void Tokenizer::add_concat_tokens(std::vector &tokens) { } Token Tokenizer::next_token() { - char c = get(); + ut8 c = get(); // Position of the character that produced this token size_t pos = i - 1; @@ -79,7 +79,7 @@ Token Tokenizer::next_token() { case '|': return {TokenType::ALTERNATION, pos}; case '(': { - int id = ++group_counter; + st32 id = ++group_counter; group_stack.push(id); Token t{TokenType::LPAREN, pos}; t.group_id = id; @@ -90,7 +90,7 @@ Token Tokenizer::next_token() { PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Mismatched ')' at position " + std::to_string(pos)); - int id = group_stack.top(); + st32 id = group_stack.top(); group_stack.pop(); Token t{TokenType::RPAREN, pos}; t.group_id = id; @@ -111,7 +111,7 @@ Token Tokenizer::next_token() { } } -Token Tokenizer::read_literal(char c) { +Token Tokenizer::read_literal(ut8 c) { Token t{TokenType::LITERAL, i - 1}; t.literal = c; return t; @@ -124,7 +124,7 @@ Token Tokenizer::read_escape() { Token t; t.pos = i - 1; - char c = get(); + ut8 c = get(); if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { t.type = TokenType::CHAR_CLASS; @@ -156,49 +156,45 @@ Token Tokenizer::read_escape() { return t; } -void Tokenizer::add_shorthand_ranges(char c, Token &t) { - const char MIN_CHAR = '\0'; // ascii index 0 - const char MAX_CHAR = '\x7F'; // ascii index 127 +void Tokenizer::add_shorthand_ranges(ut8 c, Token &t) { + static constexpr ut8 MIN_CHAR = 0; // ascii index 0 + static constexpr ut8 MAX_CHAR = ASCII_MAX; // ascii index 127 switch (c) { case 'd': - t.ranges.push_back({'0', '9'}); + t.ranges.push_back({48, 57}); // '0' - '9' break; case 'D': - t.ranges.insert(t.ranges.end(), - { 
- {MIN_CHAR, '/'}, // Everything before '0' - {':', MAX_CHAR} // Everything after '9' - }); + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, 47}, // Everything before '0' + {58, MAX_CHAR} // Everything after '9' + }); break; case 'w': - t.ranges.insert(t.ranges.end(), - {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}, {'_', '_'}}); + t.ranges.insert( + t.ranges.end(), + {{97, 122}, {65, 90}, {48, 57}, {95, 95}}); // a-z, A-Z, 0-9, _ break; case 'W': t.ranges.insert(t.ranges.end(), { - {MIN_CHAR, '/'}, // Before '0' - {':', '@'}, // Between '9' and 'A' - {'[', '^'}, // Between 'Z' and '_' - {'`', '`'}, // Between '_' and 'a' - {'{', MAX_CHAR} // After 'z' + {MIN_CHAR, 47}, // Before '0' + {58, 64}, // Between '9' and 'A' + {91, 94}, // Between 'Z' and '_' + {96, 96}, // Between '_' and 'a' + {123, MAX_CHAR} // After 'z' }); break; case 's': - t.ranges.insert(t.ranges.end(), {{' ', ' '}, - {'\t', '\t'}, - {'\n', '\n'}, - {'\r', '\r'}, - {'\f', '\f'}, - {'\v', '\v'}}); + t.ranges.insert(t.ranges.end(), {{32, 32}, // Space + {9, 13}} // \t, \n, \v, \f, \r + ); break; case 'S': - t.ranges.insert(t.ranges.end(), - { - {MIN_CHAR, '\x08'}, // Before \t (0-8) - {'\x0E', '\x1F'}, // Between \r and Space (14-31) - {'!', MAX_CHAR} // After Space (33-127) - }); + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, 8}, // Before \t + {14, 31}, // Between \r and Space + {33, MAX_CHAR} // After Space + }); break; } } @@ -242,11 +238,11 @@ Token Tokenizer::read_char_class() { bool have_prev = false; // pending character for range bool last_was_shorthand = false; // whether last token was \d, \w, etc. 
- char prev; + ut8 prev; // Read until closing ']' while (!eof() && peek() != ']') { - char c = get(); + ut8 c = get(); if (c == '\\') // Handle escape sequences { if (eof()) @@ -313,7 +309,7 @@ Token Tokenizer::read_char_class() { // Handle range syntax: if (have_prev && c == '-' && peek() != ']') { // when '-' acts as a range specifier - char ub = get(); + ut8 ub = get(); if (ub == '\\') // Handle escaped upper bound { if (eof()) @@ -385,15 +381,15 @@ Token Tokenizer::read_quantifier() { } }; - auto read_int = [&]() -> int { + auto read_int = [&]() -> st32 { skip_spaces(); - int val = 0; + st32 val = 0; bool found = false; while (!eof() && std::isdigit(peek())) { found = true; val = val * 10 + (get() - '0'); } - if (!found) + if (!found && peek() != ',') PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Expected number in quantifier at position " + std::to_string(t.pos)); From 96692dd7835860f671da4b4d9597a016a8fdd627 Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Fri, 30 Jan 2026 10:43:39 +0000 Subject: [PATCH 3/4] add postfix conversion of tokens --- libpz/include/RegexPostfix.hpp | 28 ++++++++ libpz/regex/RegexPostfix.cpp | 126 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 libpz/include/RegexPostfix.hpp create mode 100644 libpz/regex/RegexPostfix.cpp diff --git a/libpz/include/RegexPostfix.hpp b/libpz/include/RegexPostfix.hpp new file mode 100644 index 0000000..ce7a76d --- /dev/null +++ b/libpz/include/RegexPostfix.hpp @@ -0,0 +1,28 @@ +#ifndef REGEX_POSTFIX_HPP +#define REGEX_POSTFIX_HPP + +#include +#include +#include + +/** + * @brief Converts regex tokens from infix to postfix (RPN) form. + * + * This conversion is used as a preprocessing step before NFA construction. + * The class is stateless and intended to be used via its static methods. + */ +class Postfix { +public: + /** + * @brief Convert an infix token sequence into postfix order. 
+ */ + static std::vector convert(const std::vector &infix); + +private: + /** + * @brief Returns precedence of a regex operator token. + */ + static st32 get_precedence(TokenType type); +}; + +#endif // REGEX_POSTFIX_HPP \ No newline at end of file diff --git a/libpz/regex/RegexPostfix.cpp b/libpz/regex/RegexPostfix.cpp new file mode 100644 index 0000000..9c51a83 --- /dev/null +++ b/libpz/regex/RegexPostfix.cpp @@ -0,0 +1,126 @@ +#include "RegexPostfix.hpp" +#include "pz_error.hpp" + +st32 Postfix::get_precedence(TokenType type) { + switch (type) { + case TokenType::STAR: + case TokenType::PLUS: + case TokenType::QUESTION: + case TokenType::QUANTIFIER_RANGE: + return 3; // Unary postfix operators + case TokenType::CONCAT: + return 2; // Implicit concatenation + case TokenType::ALTERNATION: + return 1; // Lowest precedence + default: + return 0; + } +} + +std::vector Postfix::convert(const std::vector &infix) { + std::vector postfix; + std::stack operators; + TokenType last_type = TokenType::END; // Tracks previous token for validation + + for (const auto &t : infix) { + switch (t.type) { + // Operands go directly to output + case TokenType::LITERAL: + case TokenType::DOT: + case TokenType::CHAR_CLASS: + case TokenType::CARET: + case TokenType::DOLLAR: + postfix.push_back(t); + break; + + // '(' is pushed to operator stack and output (for NFA grouping) + case TokenType::LPAREN: { + postfix.push_back(t); + operators.push(t); + break; + } + + // Pop operators until matching '(' is found + case TokenType::RPAREN: { + if (last_type == TokenType::LPAREN) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Empty Parentheses at position " + + std::to_string(t.pos)); + while (!operators.empty() && operators.top().type != TokenType::LPAREN) { + postfix.push_back(operators.top()); + operators.pop(); + } + if (operators.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Mismatched ')' at position " + + std::to_string(t.pos)); + 
operators.pop(); // Discard '(' + postfix.push_back(t); + break; + } + // Unary postfix operators must follow a valid expression + case TokenType::STAR: + case TokenType::PLUS: + case TokenType::QUESTION: + case TokenType::QUANTIFIER_RANGE: + if (last_type != TokenType::LITERAL && last_type != TokenType::DOT && + last_type != TokenType::CHAR_CLASS && + last_type != TokenType::RPAREN) { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Quantifier used without a valid preceding " + "expression at position " + + std::to_string(t.pos)); + } + postfix.push_back(t); + break; + + case TokenType::ALTERNATION: + // '|' must separate two valid expressions + if (last_type == TokenType::END || last_type == TokenType::LPAREN || + last_type == TokenType::ALTERNATION) { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid '|' at position " + + std::to_string(t.pos) + + ". It must separate two expressions."); + } + goto push_operator; + + // Binary operators handled via precedence rules + case TokenType::CONCAT: + push_operator: + while (!operators.empty() && operators.top().type != TokenType::LPAREN && + get_precedence(operators.top().type) >= get_precedence(t.type)) { + postfix.push_back(operators.top()); + operators.pop(); + } + operators.push(t); + break; + + default: + break; + } + + if (t.type != TokenType::END) + last_type = t.type; + } + + // Pattern must not end with a binary operator + if (last_type == TokenType::ALTERNATION || last_type == TokenType::CONCAT) { + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Trailing binary operator at end of pattern at position " + + std::to_string(infix.back().pos)); + } + + // Drain remaining operators + while (!operators.empty()) { + if (operators.top().type == TokenType::LPAREN) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Unmatched '(' at position " + + std::to_string(operators.top().pos)); + postfix.push_back(operators.top()); + operators.pop(); + } 
+ + return postfix; +} \ No newline at end of file From fcd2558f9339de6865486263c3af8ae9a1593ace Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Fri, 30 Jan 2026 16:32:16 +0000 Subject: [PATCH 4/4] add nfa builder, other minor changes-cursor_pos+std::move in add_concat_tokens() std::move in add_concat_tokens() in RegexTokenizer.cpp --- libpz/include/Nfa.hpp | 112 ++++++++++++ libpz/include/NfaBuilder.hpp | 62 +++++++ libpz/include/RegexTokenizer.hpp | 2 +- libpz/regex/NfaBuilder.cpp | 298 +++++++++++++++++++++++++++++++ libpz/regex/RegexTokenizer.cpp | 32 ++-- 5 files changed, 489 insertions(+), 17 deletions(-) create mode 100644 libpz/include/Nfa.hpp create mode 100644 libpz/include/NfaBuilder.hpp create mode 100644 libpz/regex/NfaBuilder.cpp diff --git a/libpz/include/Nfa.hpp b/libpz/include/Nfa.hpp new file mode 100644 index 0000000..bb011cb --- /dev/null +++ b/libpz/include/Nfa.hpp @@ -0,0 +1,112 @@ +#ifndef NFA_HPP +#define NFA_HPP + +#include +#include +#include + +/** + * @brief Types of NFA states used in regex matching. + */ +enum class StateType { + /** Match a single literal character */ + CHAR, + + /** Match any character (.) */ + DOT, + + /** Match a character class ([...]) */ + CHAR_CLASS, + + /** Accepting (final) state */ + MATCH, + + /** ε-transition with two outgoing branches */ + SPLIT, + + /** Save input position (for capture groups) */ + SAVE, + + /** Start-of-input anchor (^) */ + ANCHOR_START, + + /** End-of-input anchor ($) */ + ANCHOR_END +}; + +/** + * @brief Represents a single state in the NFA. + */ +struct State { + StateType type; + + /** Literal character to match (valid only for CHAR states, unspecified + * otherwise). */ + ut8 c; + + /** Capture group identifier (used by SAVE states to store input positions). + */ + st32 save_id = -1; + // Even IDs represent group start, odd IDs represent group end. + + /** Character ranges for CHAR_CLASS states. */ + std::vector ranges; + bool negated = false; + + /** Primary outgoing transition. 
*/ + State *out = nullptr; + + /** Secondary outgoing transition (used only by SPLIT states). */ + State *out1 = nullptr; + + /** + * @brief Marker used during NFA simulation. + * + * Prevents revisiting the same state multiple times in a single step, + * avoiding duplicate work and infinite ε-transition loops. + */ + st32 last_list = -1; + // Marks whether this state has already been added to the current + // active-states list, preventing duplicate entries and infinite ε-transition + // loops + + State(StateType t) : type(t) {} +}; + +/** + * @brief Represents a partially constructed NFA fragment. + * + * A fragment consists of: + * - a start state + * - a list of dangling outgoing transitions that must be patched later + */ +struct Frag { + State *start; + + /** Addresses of state pointers that need to be connected later. */ + std::vector out_ptrs; + + /** + * @brief Construct a fragment with a single dangling exit. + */ + Frag(State *s) : start(s) { out_ptrs.push_back(&s->out); } + + /** + * @brief Construct a fragment with multiple dangling exits. + */ + Frag(State *s, std::vector out) : start(s), out_ptrs(out) {} + + /** + * @brief Patch all dangling exits to point to the given state. + */ + void patch(State *s) { + for (auto &ptr : out_ptrs) { + if (ptr && + !*ptr) { // Only patch if the pointer exists and is currently null + *ptr = s; + } + } + } +}; + +#endif // NFA_HPP \ No newline at end of file diff --git a/libpz/include/NfaBuilder.hpp b/libpz/include/NfaBuilder.hpp new file mode 100644 index 0000000..7d640eb --- /dev/null +++ b/libpz/include/NfaBuilder.hpp @@ -0,0 +1,62 @@ +#ifndef NFA_BUILDER_HPP +#define NFA_BUILDER_HPP + +#include + +/** + * @brief Builds an ε-NFA from a postfix regex token sequence. + * + * Implements Thompson-style construction to convert postfix regex tokens + * into an NFA graph. All states created during construction are owned + * internally and cleaned up automatically. 
+ */ +class NfaBuilder { +public: + /** + * @brief Build an NFA from a postfix regex. + * + * The resulting NFA has a single accepting state of type + * StateType::MATCH. The returned pointer refers to the start state. + * + * @param postfix Regex tokens in postfix (RPN) form. + * @return Pointer to the start state of the constructed NFA. + */ + State *build(const std::vector &postfix); + + /** + * @brief Create a deep copy of an NFA fragment. + * + * Used for handling quantifiers that require duplication of subgraphs + * (e.g. {m,n}, *, +). + */ + Frag copy_fragment(Frag); + + /** + * @brief Deep copy an NFA subgraph starting from a given state. + * + * Keeps a lookup map to avoid duplicating already-copied states. + * + * @param s Original state to copy. + * @param lookup Map from original states to their copies. + * @return Pointer to the copied state. + */ + State *copy_state(State *, std::unordered_map &); + +private: + /** + * @brief Allocate a new NFA state and store it in the internal pool. + * + * Ownership is retained by the builder to ensure correct lifetime. + */ + State *create_state(StateType type); + + /** + * @brief Owns all NFA states created during construction. + * + * Ensures that all State objects remain valid for the lifetime + * of the NfaBuilder and are automatically destroyed via RAII. 
+ */ + std::vector> state_pool; +}; + +#endif // NFA_BUILDER_HPP \ No newline at end of file diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp index 5cd2173..8ca69ca 100644 --- a/libpz/include/RegexTokenizer.hpp +++ b/libpz/include/RegexTokenizer.hpp @@ -108,7 +108,7 @@ class Tokenizer { /** Input regex pattern */ std::string_view pattern; /** Current cursor position */ - size_t i = 0; + size_t cursor_pos = 0; /** Counter for assigning group IDs */ st32 group_counter = 0; /** Stack for nested group tracking */ diff --git a/libpz/regex/NfaBuilder.cpp b/libpz/regex/NfaBuilder.cpp new file mode 100644 index 0000000..fb9285a --- /dev/null +++ b/libpz/regex/NfaBuilder.cpp @@ -0,0 +1,298 @@ +#include "NfaBuilder.hpp" +#include "pz_error.hpp" + +// Allocate a new NFA state, keep ownership in the builder(state pool), +// and return a raw pointer to the state. +State *NfaBuilder::create_state(StateType type) { + state_pool.push_back(std::make_unique(type)); + return state_pool.back().get(); +} + +// Create a deep copy of an NFA fragment. +// All states are duplicated except MATCH states, which are shared. 
+Frag NfaBuilder::copy_fragment(Frag original) { + std::unordered_map + old_to_new; // stores the states we have already visited and its cloned + // copies + State *new_start = copy_state(original.start, old_to_new); + + std::vector new_exits; + + // Traverse copied graph to collect dangling exits + std::unordered_set + visited; // Remember which states have been already visited + std::stack s; + s.push(new_start); + while (!s.empty()) // Loop until there are no more states left to process + { + State *curr = s.top(); + s.pop(); + if (!curr || visited.count(curr)) + continue; + visited.insert(curr); + // If out is null, it's a dangling exit we need to patch later (Unpatched + // primary exit) + if (!curr->out && curr->type != StateType::MATCH) { + new_exits.push_back(&curr->out); + } + // If out1 is null (and it's a SPLIT state), it's also an exit (Unpatched + // secondary exit for SPLIT states) + if (!curr->out1 && curr->type == StateType::SPLIT) { + new_exits.push_back(&curr->out1); + } + + if (curr->out) + s.push(curr->out); + if (curr->out1) + s.push(curr->out1); + } + + return Frag(new_start, new_exits); +} + +// Recursively clone an NFA subgraph starting from state 's'. +// The 'lookup' map ensures that each original state is copied exactly once. +// This preserves shared structure and prevents infinite recursion on cycles. +// MATCH states are not duplicated: a copied fragment always reconnects to +// the same final MATCH state during patching. 
+State *NfaBuilder::copy_state(State *s, + std::unordered_map &lookup) { + + // Null state or final MATCH state: return as-is + if (!s || s->type == StateType::MATCH) + return s; + + // If this state was already copied, reuse the existing clone + if (lookup.count(s)) + return lookup[s]; + + // Create a new state with the same semantic properties + State *result = create_state(s->type); + result->c = s->c; + result->ranges = s->ranges; + result->negated = s->negated; + result->save_id = s->save_id; + + // Record the mapping before recursing to handle cycles correctly + lookup[s] = result; + + // Recursively copy outgoing transitions + result->out = copy_state(s->out, lookup); + result->out1 = copy_state(s->out1, lookup); + return result; +} + +// Build an ε-NFA from a postfix (RPN) regex token sequence. +// The algorithm processes tokens left-to-right, maintaining a stack of +// NFA fragments. Each operator combines or transforms fragments according +// to standard Thompson construction rules. At the end, all dangling exits +// are patched to a single MATCH state. 
// Build an ε-NFA from a postfix (RPN) regex token sequence.
// Tokens are processed left-to-right while a stack holds partially built
// NFA fragments; each operator pops its operands, combines them following
// standard Thompson construction rules, and pushes the result. At the end
// every remaining dangling exit is patched into a single MATCH state.
State *NfaBuilder::build(const std::vector<Token> &postfix) {
  std::stack<Frag> stack;

  for (const auto &t : postfix) {
    switch (t.type) {

    // Atomic expressions: each pushes a single-state fragment whose
    // dangling exit is that state's primary transition.

    case TokenType::LITERAL: {
      State *s = create_state(StateType::CHAR);
      s->c = t.literal;
      stack.push(Frag(s));
      break;
    }
    case TokenType::DOT: {
      stack.push(Frag(create_state(StateType::DOT)));
      break;
    }
    case TokenType::CHAR_CLASS: {
      State *s = create_state(StateType::CHAR_CLASS);
      s->ranges = t.ranges;
      s->negated = t.negated;
      stack.push(Frag(s));
      break;
    }
    case TokenType::CARET: {
      stack.push(Frag(create_state(StateType::ANCHOR_START)));
      break;
    }
    case TokenType::DOLLAR: {
      stack.push(Frag(create_state(StateType::ANCHOR_END)));
      break;
    }

    // Capture groups: SAVE states record submatch boundaries. Group g uses
    // slot 2*g for its start and 2*g+1 for its end.

    case TokenType::LPAREN: {
      State *s = create_state(StateType::SAVE);
      s->save_id = t.group_id * 2; // capture start (even)
      stack.push(Frag(s));
      break;
    }
    case TokenType::RPAREN: {
      // Create the save (end) state
      State *s = create_state(StateType::SAVE);
      s->save_id = t.group_id * 2 + 1; // capture end (odd)

      // Pop the group body, then the matching SAVE-start fragment that
      // LPAREN pushed; wire start -> body -> end.
      Frag content = stack.top();
      stack.pop();
      Frag lparen_frag = stack.top();
      stack.pop();
      lparen_frag.patch(content.start);
      content.patch(s);

      // The whole group's only exit is the SAVE-end state's transition.
      stack.push(Frag(lparen_frag.start, {&s->out}));
      break;
    }

    // Binary operators:

    case TokenType::CONCAT: {
      Frag e2 = stack.top();
      stack.pop();
      Frag e1 = stack.top();
      stack.pop();
      // e1's exits flow into e2; the combined fragment keeps e2's exits.
      e1.patch(e2.start);
      stack.push(Frag(e1.start, e2.out_ptrs));
      break;
    }
    case TokenType::ALTERNATION: {
      Frag e2 = stack.top();
      stack.pop();
      Frag e1 = stack.top();
      stack.pop();
      // A SPLIT state branches into either alternative.
      State *s = create_state(StateType::SPLIT);
      s->out = e1.start;
      s->out1 = e2.start;
      // Combine dangling exits from both branches
      std::vector<State **> combined = e1.out_ptrs;
      combined.insert(combined.end(), e2.out_ptrs.begin(), e2.out_ptrs.end());
      stack.push(Frag(s, combined));
      break;
    }

    // Unary operators:

    case TokenType::STAR: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start;               // Loop back into the expression
      e.patch(s);                     // The expression's end loops back to the split
      stack.push(Frag(s, {&s->out1})); // out1 is the escape route
      break;
    }
    case TokenType::PLUS: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start; // Loop back
      e.patch(s);       // Connect expression end to split
      // Unlike STAR, the fragment starts at the expression itself, so at
      // least one iteration is mandatory.
      stack.push(Frag(e.start, {&s->out1}));
      break;
    }
    case TokenType::QUESTION: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start; // Option 1: match the expression
      // Option 2: skip the expression (out1)
      std::vector<State **> exits = e.out_ptrs;
      exits.push_back(&s->out1);
      stack.push(Frag(s, exits));
      break;
    }

    // Bounded repetition: expand {m,n} into m mandatory copies followed by
    // optional copies (or a loop for an unbounded upper bound).

    case TokenType::QUANTIFIER_RANGE: {
      Frag e = stack.top();
      stack.pop();

      // i) Handle the mandatory part (m).
      // Initialize 'mandatory' with an immediately-invoked lambda because
      // Frag has no meaningful default state. For min == 0 it is a bare
      // SPLIT acting as an epsilon placeholder.
      // NOTE(review): when t.min == 0 the epsilon SPLIT's out1 is never
      // patched and stays null — presumably the matcher skips null
      // transitions; confirm against the NFA simulator.
      Frag mandatory = [&]() {
        if (t.min == 0) {
          State *eps = create_state(StateType::SPLIT);
          return Frag(eps, {&eps->out});
        } else {
          return copy_fragment(e); // Use the first one as the base
        }
      }();
      // If min > 1, append the necessary copies
      for (int i = 1; i < t.min; i++) {
        Frag next_copy = copy_fragment(e);
        mandatory.patch(next_copy.start);
        mandatory = Frag(mandatory.start, next_copy.out_ptrs);
      }

      // ii) Handle the optional part (n - m) or the infinite tail {m,}.
      if (t.max == -1) { // {m,}
        // Trailing Kleene loop: split either repeats the expression or
        // escapes through out1.
        State *s = create_state(StateType::SPLIT);
        Frag loop_part = copy_fragment(e);

        s->out = loop_part.start;
        loop_part.patch(s);

        mandatory.patch(s);
        stack.push(Frag(mandatory.start, {&s->out1}));
      } else if (t.max > t.min) { // {m,n}
        // Build a chain of optional fragments, each one guarded by a SPLIT
        // that can either take the repetition or skip it and move on.
        Frag optional_chain = mandatory;
        std::vector<State **> all_exits;

        for (int i = 0; i < (t.max - t.min); i++) {
          Frag next_opt = copy_fragment(e);
          State *s = create_state(StateType::SPLIT);

          s->out = next_opt.start;
          optional_chain.patch(s);

          // Collect exits from the skip path
          all_exits.push_back(&s->out1);

          optional_chain = Frag(next_opt.start, next_opt.out_ptrs);
        }
        // Add exits from the last repetition: if all optional parts are
        // taken, the match continues after the final copied fragment.
        all_exits.insert(all_exits.end(), optional_chain.out_ptrs.begin(),
                         optional_chain.out_ptrs.end());
        stack.push(Frag(mandatory.start, all_exits));
      } else { // {m}
        stack.push(mandatory);
      }
      break;
    }
    default:
      break;
    }
  }

  // Empty regex produces an ε-NFA (no fragments): a lone SPLIT whose exit
  // is patched straight into MATCH below.
  if (stack.empty()) {
    State *s = create_state(StateType::SPLIT);
    stack.push(Frag(s));
  }

  // Implicit concatenation of remaining fragments (defensive: a
  // well-formed postfix stream leaves exactly one fragment).
  while (stack.size() > 1) {
    Frag e2 = stack.top();
    stack.pop();
    Frag e1 = stack.top();
    stack.pop();
    e1.patch(e2.start);
    stack.push(Frag(e1.start, e2.out_ptrs));
  }

  // Patch all remaining exits to the final MATCH state
  Frag final_frag = stack.top();
  stack.pop();
  State *match_state = create_state(StateType::MATCH);
  final_frag.patch(match_state);

  return final_frag.start;
}
'\0' : pattern[cursor_pos++]; } -bool Tokenizer::eof() const { return i >= pattern.size(); } +bool Tokenizer::eof() const { return cursor_pos >= pattern.size(); } std::vector Tokenizer::tokenize() { std::vector tokens; while (!eof()) { tokens.push_back(next_token()); } - tokens.push_back(Token{TokenType::END, i}); + tokens.push_back(Token{TokenType::END, cursor_pos}); add_concat_tokens(tokens); return tokens; } @@ -27,12 +27,12 @@ void Tokenizer::add_concat_tokens(std::vector &tokens) { normalized.reserve(tokens.size() * 2); for (size_t idx = 0; idx < tokens.size(); idx++) { - normalized.push_back(tokens[idx]); + normalized.push_back(std::move(tokens[idx])); if (idx + 1 >= tokens.size()) break; - const Token ¤t = tokens[idx]; + const Token ¤t = normalized.back(); const Token &next = tokens[idx + 1]; // Can the current token be the left side of a concatenation? @@ -65,7 +65,7 @@ Token Tokenizer::next_token() { ut8 c = get(); // Position of the character that produced this token - size_t pos = i - 1; + size_t pos = cursor_pos - 1; switch (c) { case '.': @@ -112,7 +112,7 @@ Token Tokenizer::next_token() { } Token Tokenizer::read_literal(ut8 c) { - Token t{TokenType::LITERAL, i - 1}; + Token t{TokenType::LITERAL, cursor_pos - 1}; t.literal = c; return t; } @@ -123,7 +123,7 @@ Token Tokenizer::read_escape() { "Dangling escape at end of input"); Token t; - t.pos = i - 1; + t.pos = cursor_pos - 1; ut8 c = get(); if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { @@ -230,7 +230,7 @@ void Tokenizer::normalize_ranges(std::vector &ranges) { } Token Tokenizer::read_char_class() { - Token t{TokenType::CHAR_CLASS, i - 1}; + Token t{TokenType::CHAR_CLASS, cursor_pos - 1}; if (peek() == '^') { t.negated = true; get(); @@ -248,7 +248,7 @@ Token Tokenizer::read_char_class() { if (eof()) PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Dangling escape in char class at position " + - std::to_string(i)); + std::to_string(cursor_pos)); // Flush 
pending literal before escape if (have_prev) { t.ranges.push_back({prev, prev}); @@ -316,20 +316,20 @@ Token Tokenizer::read_char_class() { PzError::report_error( PzError::PzErrorType::PZ_INVALID_INPUT, "Dangling escape in character range at position " + - std::to_string(i)); + std::to_string(cursor_pos)); ub = get(); if (ub == 'd' || ub == 'D' || ub == 'w' || ub == 'W' || ub == 's' || ub == 'S') { PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Cannot create a range with shorthand escape " "sequences at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); } } if (prev > ub) PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Invalid character range at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); t.ranges.push_back({prev, ub}); have_prev = false; continue; @@ -338,7 +338,7 @@ Token Tokenizer::read_char_class() { PzError::report_error( PzError::PzErrorType::PZ_INVALID_INPUT, "Cannot create a range with shorthand escape sequences at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); } // Flush pending literal if no range follows @@ -373,7 +373,7 @@ Token Tokenizer::read_char_class() { Token Tokenizer::read_quantifier() { // Position of '{' is stored in t.pos for error reporting - Token t{TokenType::QUANTIFIER_RANGE, i - 1}; + Token t{TokenType::QUANTIFIER_RANGE, cursor_pos - 1}; auto skip_spaces = [&]() { while (!eof() && std::isspace(peek())) {