From a40f05d60bade7bfc6637019f8ea92b5b62fdc37 Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Mon, 26 Jan 2026 14:07:12 +0000 Subject: [PATCH 1/4] Add regex tokenizer Revert accidental formatting changes Revert accidental formatting changes in exact module Final fixes l --- libpz/include/RegexTokenizer.hpp | 144 ++++++++++ libpz/include/pz_cxx_std.hpp | 1 + libpz/regex/RegexTokenizer.cpp | 437 +++++++++++++++++++++++++++++++ 3 files changed, 582 insertions(+) create mode 100644 libpz/include/RegexTokenizer.hpp create mode 100644 libpz/regex/RegexTokenizer.cpp diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp new file mode 100644 index 0000000..1b87e78 --- /dev/null +++ b/libpz/include/RegexTokenizer.hpp @@ -0,0 +1,144 @@ +#ifndef REGEX_TOKENIZER_HPP +#define REGEX_TOKENIZER_HPP + +#include + +/** + * @brief Types of tokens produced by the regex tokenizer. + */ +enum class TokenType { + /** Literal character like 'a', 'b', etc. */ + LITERAL, + + /** '.' wildcard */ + DOT, + + /** '*' operator */ + STAR, + + /** '+' operator */ + PLUS, + + /** '?' operator */ + QUESTION, + + /** '|' alternation */ + ALTERNATION, + + /** '(' opening group */ + LPAREN, + + /** ')' closing group */ + RPAREN, + + /** '^' start anchor */ + CARET, + + /** '$' end anchor */ + DOLLAR, + + /** Character class: '[...]', \d, \w, \s, etc */ + CHAR_CLASS, + + /** Quantifier range: '{m,n}', '{m,}', '{m}' */ + QUANTIFIER_RANGE, + + /** End of pattern */ + END, + + /** Implicit concatenation */ + CONCAT +}; + +/** + * @brief Represents a character range [lo, hi]. + */ +struct CharRange { + /** Lower bound */ + char lo; + + /** Upper bound */ + char hi; +}; + +/** + * @brief A single token in the regex. 
+ */ +struct Token { + /** Token category */ + TokenType type; + /** Position in pattern (for error reporting) */ + size_t pos; + /** Group ID for parentheses */ + int group_id = -1; + + /** Literal character value */ + char literal = '\0'; + + /** Whether character class is negated */ + bool negated = false; + /** Character ranges for character class */ + std::vector ranges{}; + + /** Minimum repetitions for quantifier */ + int min = 0; + /** Maximum repetitions (-1 means unbounded) */ + int max = 0; +}; + +/** + * @brief Converts a regex pattern into a sequence of tokens. + */ +class Tokenizer { +public: + /** + * @brief Construct tokenizer for a pattern. + * @param pat Regex pattern. + */ + explicit Tokenizer(std::string_view pat); + + /** + * @brief Tokenize the entire pattern. + * @return Vector of tokens ending with END token. + */ + std::vector tokenize(); + +private: + /** Input regex pattern */ + std::string_view pattern; + /** Current cursor position */ + size_t i = 0; + /** Counter for assigning group IDs */ + int group_counter = 0; + /** Stack for nested group tracking */ + std::stack group_stack; + + /** Peek next character without consuming */ + char peek() const; + /** Consume next character */ + char get(); + /** Check for end of input */ + bool eof() const; + + /** Read next token */ + Token next_token(); + /** Read literal character */ + Token read_literal(char); + /** Read escape sequence */ + Token read_escape(); + /** Read character class */ + Token read_char_class(); + /** Read quantifier range */ + Token read_quantifier(); + + /** @brief Populates a token with ranges for \d, \w, \s, etc. */ + void add_shorthand_ranges(char, Token &); + + /** @brief Inserts implicit CONCAT tokens where concatenation occurs. */ + void add_concat_tokens(std::vector &); + + /** @brief Sorts and merges overlapping ranges for efficient NFA matching. 
*/ + void normalize_ranges(std::vector &); +}; + +#endif // REGEX_TOKENIZER_HPP \ No newline at end of file diff --git a/libpz/include/pz_cxx_std.hpp b/libpz/include/pz_cxx_std.hpp index f4c0160..4b38066 100644 --- a/libpz/include/pz_cxx_std.hpp +++ b/libpz/include/pz_cxx_std.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/libpz/regex/RegexTokenizer.cpp b/libpz/regex/RegexTokenizer.cpp new file mode 100644 index 0000000..cc12b17 --- /dev/null +++ b/libpz/regex/RegexTokenizer.cpp @@ -0,0 +1,437 @@ +#include "RegexTokenizer.hpp" +#include "pz_error.hpp" + +Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {} + +char Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; } + +char Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } + +bool Tokenizer::eof() const { return i >= pattern.size(); } + +std::vector Tokenizer::tokenize() { + std::vector tokens; + while (!eof()) { + tokens.push_back(next_token()); + } + tokens.push_back(Token{TokenType::END, i}); + add_concat_tokens(tokens); + return tokens; +} + +void Tokenizer::add_concat_tokens(std::vector &tokens) { + if (tokens.size() <= 2) + return; + + std::vector normalized; + normalized.reserve(tokens.size() * 2); + + for (size_t idx = 0; idx < tokens.size(); idx++) { + normalized.push_back(tokens[idx]); + + if (idx + 1 >= tokens.size()) + break; + + const Token ¤t = tokens[idx]; + const Token &next = tokens[idx + 1]; + + // Can the current token be the left side of a concatenation? 
+ bool is_ender = + (current.type == TokenType::LITERAL || current.type == TokenType::DOT || + current.type == TokenType::CHAR_CLASS || + current.type == TokenType::RPAREN || current.type == TokenType::STAR || + current.type == TokenType::PLUS || + current.type == TokenType::QUESTION || + current.type == TokenType::QUANTIFIER_RANGE || + current.type == TokenType::CARET); + + bool is_starter = + (next.type == TokenType::LITERAL || next.type == TokenType::DOT || + next.type == TokenType::LPAREN || next.type == TokenType::CHAR_CLASS || + next.type == TokenType::DOLLAR); + + if (is_ender && is_starter) { + Token concat; + concat.type = TokenType::CONCAT; + concat.pos = current.pos; + normalized.push_back(concat); + } + } + + tokens = std::move(normalized); +} + +Token Tokenizer::next_token() { + char c = get(); + + // Position of the character that produced this token + size_t pos = i - 1; + + switch (c) { + case '.': + return {TokenType::DOT, pos}; + case '*': + return {TokenType::STAR, pos}; + case '+': + return {TokenType::PLUS, pos}; + case '?': + return {TokenType::QUESTION, pos}; + case '|': + return {TokenType::ALTERNATION, pos}; + case '(': { + int id = ++group_counter; + group_stack.push(id); + Token t{TokenType::LPAREN, pos}; + t.group_id = id; + return t; + } + case ')': { + if (group_stack.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Mismatched ')' at position " + + std::to_string(pos)); + int id = group_stack.top(); + group_stack.pop(); + Token t{TokenType::RPAREN, pos}; + t.group_id = id; + return t; + } + case '^': + return {TokenType::CARET, pos}; + case '$': + return {TokenType::DOLLAR, pos}; + case '\\': + return read_escape(); + case '[': + return read_char_class(); + case '{': + return read_quantifier(); + default: + return read_literal(c); + } +} + +Token Tokenizer::read_literal(char c) { + Token t{TokenType::LITERAL, i - 1}; + t.literal = c; + return t; +} + +Token Tokenizer::read_escape() { + if (eof()) + 
PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape at end of input"); + + Token t; + t.pos = i - 1; + char c = get(); + + if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { + t.type = TokenType::CHAR_CLASS; + add_shorthand_ranges(c, t); + return t; + } + + t.type = TokenType::LITERAL; + switch (c) { + case 'n': + t.literal = '\n'; + break; + case 't': + t.literal = '\t'; + break; + case 'r': + t.literal = '\r'; + break; + case 'f': + t.literal = '\f'; + break; + case 'v': + t.literal = '\v'; + break; + default: + t.literal = c; + break; + } + return t; +} + +void Tokenizer::add_shorthand_ranges(char c, Token &t) { + const char MIN_CHAR = '\0'; // ascii index 0 + const char MAX_CHAR = '\x7F'; // ascii index 127 + switch (c) { + case 'd': + t.ranges.push_back({'0', '9'}); + break; + case 'D': + t.ranges.insert(t.ranges.end(), + { + {MIN_CHAR, '/'}, // Everything before '0' + {':', MAX_CHAR} // Everything after '9' + }); + break; + case 'w': + t.ranges.insert(t.ranges.end(), + {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}, {'_', '_'}}); + break; + case 'W': + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, '/'}, // Before '0' + {':', '@'}, // Between '9' and 'A' + {'[', '^'}, // Between 'Z' and '_' + {'`', '`'}, // Between '_' and 'a' + {'{', MAX_CHAR} // After 'z' + }); + break; + case 's': + t.ranges.insert(t.ranges.end(), {{' ', ' '}, + {'\t', '\t'}, + {'\n', '\n'}, + {'\r', '\r'}, + {'\f', '\f'}, + {'\v', '\v'}}); + break; + + case 'S': + t.ranges.insert(t.ranges.end(), + { + {MIN_CHAR, '\x08'}, // Before \t (0-8) + {'\x0E', '\x1F'}, // Between \r and Space (14-31) + {'!', MAX_CHAR} // After Space (33-127) + }); + break; + } +} + +void Tokenizer::normalize_ranges(std::vector &ranges) { + if (ranges.empty()) + return; + + std::sort(ranges.begin(), ranges.end(), + [](const CharRange &a, const CharRange &b) { + if (a.lo != b.lo) + return a.lo < b.lo; + return a.hi < b.hi; + }); + + size_t write = 0; + + for (size_t read = 
1; read < ranges.size(); ++read) { + CharRange &last = ranges[write]; + const CharRange &cur = ranges[read]; + + if (cur.lo <= last.hi + 1) { + // merge into last + last.hi = std::max(last.hi, cur.hi); + } else { + // move cur to next write position + ++write; + ranges[write] = cur; + } + } + + ranges.resize(write + 1); +} + +Token Tokenizer::read_char_class() { + Token t{TokenType::CHAR_CLASS, i - 1}; + if (peek() == '^') { + t.negated = true; + get(); + } + + bool have_prev = false; // pending character for range + bool last_was_shorthand = false; // whether last token was \d, \w, etc. + char prev; + + // Read until closing ']' + while (!eof() && peek() != ']') { + char c = get(); + if (c == '\\') // Handle escape sequences + { + if (eof()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape in char class at position " + + std::to_string(i)); + // Flush pending literal before escape + if (have_prev) { + t.ranges.push_back({prev, prev}); + have_prev = false; + } + c = get(); + switch (c) { + // Common escaped control characters + case 'n': + prev = '\n'; + have_prev = true; + last_was_shorthand = false; + break; + case 't': + prev = '\t'; + have_prev = true; + last_was_shorthand = false; + break; + case 'r': + prev = '\r'; + have_prev = true; + last_was_shorthand = false; + break; + case 'f': + prev = '\f'; + have_prev = true; + last_was_shorthand = false; + break; + case 'v': + prev = '\v'; + have_prev = true; + last_was_shorthand = false; + break; + + // Shorthand character classes + case 'd': + case 'w': + case 's': + case 'D': + case 'W': + case 'S': { + add_shorthand_ranges(c, t); + last_was_shorthand = true; + break; + } + + // Escaped literal characters + default: { + prev = c; + have_prev = true; + last_was_shorthand = false; + break; + } + } + continue; + } + + // Handle range syntax: + if (have_prev && c == '-' && + peek() != ']') { // when '-' acts as a range specifier + char ub = get(); + if (ub == '\\') // Handle escaped 
upper bound + { + if (eof()) + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Dangling escape in character range at position " + + std::to_string(i)); + ub = get(); + if (ub == 'd' || ub == 'D' || ub == 'w' || ub == 'W' || ub == 's' || + ub == 'S') { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Cannot create a range with shorthand escape " + "sequences at position " + + std::to_string(i - 1)); + } + } + if (prev > ub) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid character range at position " + + std::to_string(i - 1)); + t.ranges.push_back({prev, ub}); + have_prev = false; + continue; + } + if (c == '-' && last_was_shorthand && peek() != ']') { + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Cannot create a range with shorthand escape sequences at position " + + std::to_string(i - 1)); + } + + // Flush pending literal if no range follows + if (have_prev) + t.ranges.push_back({prev, prev}); + + prev = c; + have_prev = true; + last_was_shorthand = false; + } + + // Missing closing ']' + if (eof()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Unterminated character class starting at position " + + std::to_string(t.pos)); + if (have_prev) + t.ranges.push_back({prev, prev}); // Flush last pending character + if (t.ranges.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Empty char class starting at position " + + std::to_string(t.pos)); // Disallow empty classes + get(); // consume ']' + normalize_ranges(t.ranges); + return t; +} +// NOTE: []] will be treated as an empty character class followed by a ] literal +// In many regex implementations, it gets processed as a valid char class with +// literal ']' but we currently treat the earliest found ] as the end of the +// char class as a design choice. To use ] as a literal inside the char class, +// user needs to escape it. 
+ +Token Tokenizer::read_quantifier() { + // Position of '{' is stored in t.pos for error reporting + Token t{TokenType::QUANTIFIER_RANGE, i - 1}; + + auto skip_spaces = [&]() { + while (!eof() && std::isspace(peek())) { + get(); + } + }; + + auto read_int = [&]() -> int { + skip_spaces(); + int val = 0; + bool found = false; + while (!eof() && std::isdigit(peek())) { + found = true; + val = val * 10 + (get() - '0'); + } + if (!found) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Expected number in quantifier at position " + + std::to_string(t.pos)); + skip_spaces(); + return val; + }; + + t.min = read_int(); + + if (peek() == '}') { + get(); + t.max = t.min; + return t; + } + + if (peek() != ',') + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier syntax at position " + + std::to_string(t.pos)); + get(); + skip_spaces(); + + if (peek() == '}') { + get(); + t.max = -1; + return t; + } + + t.max = read_int(); + if (peek() != '}') + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier syntax at position " + + std::to_string(t.pos)); + get(); + + if (t.max != -1 && t.max < t.min) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid quantifier range at position " + + std::to_string(t.pos)); + return t; +} \ No newline at end of file From c1d2b486e83be8a923c10be9b218bdc1f7da9abf Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Tue, 27 Jan 2026 13:35:38 +0000 Subject: [PATCH 2/4] handle {,num} and use pz_types --- libpz/include/RegexTokenizer.hpp | 25 +++++----- libpz/regex/RegexTokenizer.cpp | 78 +++++++++++++++----------------- 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp index 1b87e78..5cd2173 100644 --- a/libpz/include/RegexTokenizer.hpp +++ b/libpz/include/RegexTokenizer.hpp @@ -2,6 +2,7 @@ #define REGEX_TOKENIZER_HPP #include +#include /** * @brief Types of tokens produced 
by the regex tokenizer. @@ -55,10 +56,10 @@ enum class TokenType { */ struct CharRange { /** Lower bound */ - char lo; + ut8 lo; /** Upper bound */ - char hi; + ut8 hi; }; /** @@ -70,10 +71,10 @@ struct Token { /** Position in pattern (for error reporting) */ size_t pos; /** Group ID for parentheses */ - int group_id = -1; + st32 group_id = -1; /** Literal character value */ - char literal = '\0'; + ut8 literal = '\0'; /** Whether character class is negated */ bool negated = false; @@ -81,9 +82,9 @@ struct Token { std::vector ranges{}; /** Minimum repetitions for quantifier */ - int min = 0; + st32 min = 0; /** Maximum repetitions (-1 means unbounded) */ - int max = 0; + st32 max = 0; }; /** @@ -109,21 +110,21 @@ class Tokenizer { /** Current cursor position */ size_t i = 0; /** Counter for assigning group IDs */ - int group_counter = 0; + st32 group_counter = 0; /** Stack for nested group tracking */ - std::stack group_stack; + std::stack group_stack; /** Peek next character without consuming */ - char peek() const; + ut8 peek() const; /** Consume next character */ - char get(); + ut8 get(); /** Check for end of input */ bool eof() const; /** Read next token */ Token next_token(); /** Read literal character */ - Token read_literal(char); + Token read_literal(ut8); /** Read escape sequence */ Token read_escape(); /** Read character class */ @@ -132,7 +133,7 @@ class Tokenizer { Token read_quantifier(); /** @brief Populates a token with ranges for \d, \w, \s, etc. */ - void add_shorthand_ranges(char, Token &); + void add_shorthand_ranges(ut8, Token &); /** @brief Inserts implicit CONCAT tokens where concatenation occurs. */ void add_concat_tokens(std::vector &); diff --git a/libpz/regex/RegexTokenizer.cpp b/libpz/regex/RegexTokenizer.cpp index cc12b17..afd3203 100644 --- a/libpz/regex/RegexTokenizer.cpp +++ b/libpz/regex/RegexTokenizer.cpp @@ -3,9 +3,9 @@ Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {} -char Tokenizer::peek() const { return eof() ? 
'\0' : pattern[i]; } +ut8 Tokenizer::peek() const { return eof() ? '\0' : pattern[i]; } -char Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } +ut8 Tokenizer::get() { return eof() ? '\0' : pattern[i++]; } bool Tokenizer::eof() const { return i >= pattern.size(); } @@ -62,7 +62,7 @@ void Tokenizer::add_concat_tokens(std::vector &tokens) { } Token Tokenizer::next_token() { - char c = get(); + ut8 c = get(); // Position of the character that produced this token size_t pos = i - 1; @@ -79,7 +79,7 @@ Token Tokenizer::next_token() { case '|': return {TokenType::ALTERNATION, pos}; case '(': { - int id = ++group_counter; + st32 id = ++group_counter; group_stack.push(id); Token t{TokenType::LPAREN, pos}; t.group_id = id; @@ -90,7 +90,7 @@ Token Tokenizer::next_token() { PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Mismatched ')' at position " + std::to_string(pos)); - int id = group_stack.top(); + st32 id = group_stack.top(); group_stack.pop(); Token t{TokenType::RPAREN, pos}; t.group_id = id; @@ -111,7 +111,7 @@ Token Tokenizer::next_token() { } } -Token Tokenizer::read_literal(char c) { +Token Tokenizer::read_literal(ut8 c) { Token t{TokenType::LITERAL, i - 1}; t.literal = c; return t; @@ -124,7 +124,7 @@ Token Tokenizer::read_escape() { Token t; t.pos = i - 1; - char c = get(); + ut8 c = get(); if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { t.type = TokenType::CHAR_CLASS; @@ -156,49 +156,45 @@ Token Tokenizer::read_escape() { return t; } -void Tokenizer::add_shorthand_ranges(char c, Token &t) { - const char MIN_CHAR = '\0'; // ascii index 0 - const char MAX_CHAR = '\x7F'; // ascii index 127 +void Tokenizer::add_shorthand_ranges(ut8 c, Token &t) { + static constexpr ut8 MIN_CHAR = 0; // ascii index 0 + static constexpr ut8 MAX_CHAR = ASCII_MAX; // ascii index 127 switch (c) { case 'd': - t.ranges.push_back({'0', '9'}); + t.ranges.push_back({48, 57}); // '0' - '9' break; case 'D': - t.ranges.insert(t.ranges.end(), - { 
- {MIN_CHAR, '/'}, // Everything before '0' - {':', MAX_CHAR} // Everything after '9' - }); + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, 47}, // Everything before '0' + {58, MAX_CHAR} // Everything after '9' + }); break; case 'w': - t.ranges.insert(t.ranges.end(), - {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}, {'_', '_'}}); + t.ranges.insert( + t.ranges.end(), + {{97, 122}, {65, 90}, {48, 57}, {95, 95}}); // a-z, A-Z, 0-9, _ break; case 'W': t.ranges.insert(t.ranges.end(), { - {MIN_CHAR, '/'}, // Before '0' - {':', '@'}, // Between '9' and 'A' - {'[', '^'}, // Between 'Z' and '_' - {'`', '`'}, // Between '_' and 'a' - {'{', MAX_CHAR} // After 'z' + {MIN_CHAR, 47}, // Before '0' + {58, 64}, // Between '9' and 'A' + {91, 94}, // Between 'Z' and '_' + {96, 96}, // Between '_' and 'a' + {123, MAX_CHAR} // After 'z' }); break; case 's': - t.ranges.insert(t.ranges.end(), {{' ', ' '}, - {'\t', '\t'}, - {'\n', '\n'}, - {'\r', '\r'}, - {'\f', '\f'}, - {'\v', '\v'}}); + t.ranges.insert(t.ranges.end(), {{32, 32}, // Space + {9, 13}} // \t, \n, \v, \f, \r + ); break; case 'S': - t.ranges.insert(t.ranges.end(), - { - {MIN_CHAR, '\x08'}, // Before \t (0-8) - {'\x0E', '\x1F'}, // Between \r and Space (14-31) - {'!', MAX_CHAR} // After Space (33-127) - }); + t.ranges.insert(t.ranges.end(), { + {MIN_CHAR, 8}, // Before \t + {14, 31}, // Between \r and Space + {33, MAX_CHAR} // After Space + }); break; } } @@ -242,11 +238,11 @@ Token Tokenizer::read_char_class() { bool have_prev = false; // pending character for range bool last_was_shorthand = false; // whether last token was \d, \w, etc. 
- char prev; + ut8 prev; // Read until closing ']' while (!eof() && peek() != ']') { - char c = get(); + ut8 c = get(); if (c == '\\') // Handle escape sequences { if (eof()) @@ -313,7 +309,7 @@ Token Tokenizer::read_char_class() { // Handle range syntax: if (have_prev && c == '-' && peek() != ']') { // when '-' acts as a range specifier - char ub = get(); + ut8 ub = get(); if (ub == '\\') // Handle escaped upper bound { if (eof()) @@ -385,15 +381,15 @@ Token Tokenizer::read_quantifier() { } }; - auto read_int = [&]() -> int { + auto read_int = [&]() -> st32 { skip_spaces(); - int val = 0; + st32 val = 0; bool found = false; while (!eof() && std::isdigit(peek())) { found = true; val = val * 10 + (get() - '0'); } - if (!found) + if (!found && peek() != ',') PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Expected number in quantifier at position " + std::to_string(t.pos)); From 96692dd7835860f671da4b4d9597a016a8fdd627 Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Fri, 30 Jan 2026 10:43:39 +0000 Subject: [PATCH 3/4] add postfix conversion of tokens --- libpz/include/RegexPostfix.hpp | 28 ++++++++ libpz/regex/RegexPostfix.cpp | 126 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 libpz/include/RegexPostfix.hpp create mode 100644 libpz/regex/RegexPostfix.cpp diff --git a/libpz/include/RegexPostfix.hpp b/libpz/include/RegexPostfix.hpp new file mode 100644 index 0000000..ce7a76d --- /dev/null +++ b/libpz/include/RegexPostfix.hpp @@ -0,0 +1,28 @@ +#ifndef REGEX_POSTFIX_HPP +#define REGEX_POSTFIX_HPP + +#include +#include +#include + +/** + * @brief Converts regex tokens from infix to postfix (RPN) form. + * + * This conversion is used as a preprocessing step before NFA construction. + * The class is stateless and intended to be used via its static methods. + */ +class Postfix { +public: + /** + * @brief Convert an infix token sequence into postfix order. 
+ */ + static std::vector convert(const std::vector &infix); + +private: + /** + * @brief Returns precedence of a regex operator token. + */ + static st32 get_precedence(TokenType type); +}; + +#endif // REGEX_POSTFIX_HPP \ No newline at end of file diff --git a/libpz/regex/RegexPostfix.cpp b/libpz/regex/RegexPostfix.cpp new file mode 100644 index 0000000..9c51a83 --- /dev/null +++ b/libpz/regex/RegexPostfix.cpp @@ -0,0 +1,126 @@ +#include "RegexPostfix.hpp" +#include "pz_error.hpp" + +st32 Postfix::get_precedence(TokenType type) { + switch (type) { + case TokenType::STAR: + case TokenType::PLUS: + case TokenType::QUESTION: + case TokenType::QUANTIFIER_RANGE: + return 3; // Unary postfix operators + case TokenType::CONCAT: + return 2; // Implicit concatenation + case TokenType::ALTERNATION: + return 1; // Lowest precedence + default: + return 0; + } +} + +std::vector Postfix::convert(const std::vector &infix) { + std::vector postfix; + std::stack operators; + TokenType last_type = TokenType::END; // Tracks previous token for validation + + for (const auto &t : infix) { + switch (t.type) { + // Operands go directly to output + case TokenType::LITERAL: + case TokenType::DOT: + case TokenType::CHAR_CLASS: + case TokenType::CARET: + case TokenType::DOLLAR: + postfix.push_back(t); + break; + + // '(' is pushed to operator stack and output (for NFA grouping) + case TokenType::LPAREN: { + postfix.push_back(t); + operators.push(t); + break; + } + + // Pop operators until matching '(' is found + case TokenType::RPAREN: { + if (last_type == TokenType::LPAREN) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Empty Parentheses at position " + + std::to_string(t.pos)); + while (!operators.empty() && operators.top().type != TokenType::LPAREN) { + postfix.push_back(operators.top()); + operators.pop(); + } + if (operators.empty()) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Mismatched ')' at position " + + std::to_string(t.pos)); + 
operators.pop(); // Discard '(' + postfix.push_back(t); + break; + } + // Unary postfix operators must follow a valid expression + case TokenType::STAR: + case TokenType::PLUS: + case TokenType::QUESTION: + case TokenType::QUANTIFIER_RANGE: + if (last_type != TokenType::LITERAL && last_type != TokenType::DOT && + last_type != TokenType::CHAR_CLASS && + last_type != TokenType::RPAREN) { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Quantifier used without a valid preceding " + "expression at position " + + std::to_string(t.pos)); + } + postfix.push_back(t); + break; + + case TokenType::ALTERNATION: + // '|' must separate two valid expressions + if (last_type == TokenType::END || last_type == TokenType::LPAREN || + last_type == TokenType::ALTERNATION) { + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Invalid '|' at position " + + std::to_string(t.pos) + + ". It must separate two expressions."); + } + goto push_operator; + + // Binary operators handled via precedence rules + case TokenType::CONCAT: + push_operator: + while (!operators.empty() && operators.top().type != TokenType::LPAREN && + get_precedence(operators.top().type) >= get_precedence(t.type)) { + postfix.push_back(operators.top()); + operators.pop(); + } + operators.push(t); + break; + + default: + break; + } + + if (t.type != TokenType::END) + last_type = t.type; + } + + // Pattern must not end with a binary operator + if (last_type == TokenType::ALTERNATION || last_type == TokenType::CONCAT) { + PzError::report_error( + PzError::PzErrorType::PZ_INVALID_INPUT, + "Trailing binary operator at end of pattern at position " + + std::to_string(infix.back().pos)); + } + + // Drain remaining operators + while (!operators.empty()) { + if (operators.top().type == TokenType::LPAREN) + PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, + "Unmatched '(' at position " + + std::to_string(operators.top().pos)); + postfix.push_back(operators.top()); + operators.pop(); + } 
+ + return postfix; +} \ No newline at end of file From fcd2558f9339de6865486263c3af8ae9a1593ace Mon Sep 17 00:00:00 2001 From: Abhishek Rai Date: Fri, 30 Jan 2026 16:32:16 +0000 Subject: [PATCH 4/4] add nfa builder, other minor changes-cursor_pos+std::move in add_concat_tokens() std::move in add_concat_tokens() in RegexTokenizer.cpp --- libpz/include/Nfa.hpp | 112 ++++++++++++ libpz/include/NfaBuilder.hpp | 62 +++++++ libpz/include/RegexTokenizer.hpp | 2 +- libpz/regex/NfaBuilder.cpp | 298 +++++++++++++++++++++++++++++++ libpz/regex/RegexTokenizer.cpp | 32 ++-- 5 files changed, 489 insertions(+), 17 deletions(-) create mode 100644 libpz/include/Nfa.hpp create mode 100644 libpz/include/NfaBuilder.hpp create mode 100644 libpz/regex/NfaBuilder.cpp diff --git a/libpz/include/Nfa.hpp b/libpz/include/Nfa.hpp new file mode 100644 index 0000000..bb011cb --- /dev/null +++ b/libpz/include/Nfa.hpp @@ -0,0 +1,112 @@ +#ifndef NFA_HPP +#define NFA_HPP + +#include +#include +#include + +/** + * @brief Types of NFA states used in regex matching. + */ +enum class StateType { + /** Match a single literal character */ + CHAR, + + /** Match any character (.) */ + DOT, + + /** Match a character class ([...]) */ + CHAR_CLASS, + + /** Accepting (final) state */ + MATCH, + + /** ε-transition with two outgoing branches */ + SPLIT, + + /** Save input position (for capture groups) */ + SAVE, + + /** Start-of-input anchor (^) */ + ANCHOR_START, + + /** End-of-input anchor ($) */ + ANCHOR_END +}; + +/** + * @brief Represents a single state in the NFA. + */ +struct State { + StateType type; + + /** Literal character to match (valid only for CHAR states, unspecified + * otherwise). */ + ut8 c; + + /** Capture group identifier (used by SAVE states to store input positions). + */ + st32 save_id = -1; + // Even IDs represent group start, odd IDs represent group end. + + /** Character ranges for CHAR_CLASS states. */ + std::vector ranges; + bool negated = false; + + /** Primary outgoing transition. 
*/ + State *out = nullptr; + + /** Secondary outgoing transition (used only by SPLIT states). */ + State *out1 = nullptr; + + /** + * @brief Marker used during NFA simulation. + * + * Prevents revisiting the same state multiple times in a single step, + * avoiding duplicate work and infinite ε-transition loops. + */ + st32 last_list = -1; + // Marks whether this state has already been added to the current + // active-states list, preventing duplicate entries and infinite ε-transition + // loops + + State(StateType t) : type(t) {} +}; + +/** + * @brief Represents a partially constructed NFA fragment. + * + * A fragment consists of: + * - a start state + * - a list of dangling outgoing transitions that must be patched later + */ +struct Frag { + State *start; + + /** Addresses of state pointers that need to be connected later. */ + std::vector out_ptrs; + + /** + * @brief Construct a fragment with a single dangling exit. + */ + Frag(State *s) : start(s) { out_ptrs.push_back(&s->out); } + + /** + * @brief Construct a fragment with multiple dangling exits. + */ + Frag(State *s, std::vector out) : start(s), out_ptrs(out) {} + + /** + * @brief Patch all dangling exits to point to the given state. + */ + void patch(State *s) { + for (auto &ptr : out_ptrs) { + if (ptr && + !*ptr) { // Only patch if the pointer exists and is currently null + *ptr = s; + } + } + } +}; + +#endif // NFA_HPP \ No newline at end of file diff --git a/libpz/include/NfaBuilder.hpp b/libpz/include/NfaBuilder.hpp new file mode 100644 index 0000000..7d640eb --- /dev/null +++ b/libpz/include/NfaBuilder.hpp @@ -0,0 +1,62 @@ +#ifndef NFA_BUILDER_HPP +#define NFA_BUILDER_HPP + +#include + +/** + * @brief Builds an ε-NFA from a postfix regex token sequence. + * + * Implements Thompson-style construction to convert postfix regex tokens + * into an NFA graph. All states created during construction are owned + * internally and cleaned up automatically. 
+ */ +class NfaBuilder { +public: + /** + * @brief Build an NFA from a postfix regex. + * + * The resulting NFA has a single accepting state of type + * StateType::MATCH. The returned pointer refers to the start state. + * + * @param postfix Regex tokens in postfix (RPN) form. + * @return Pointer to the start state of the constructed NFA. + */ + State *build(const std::vector &postfix); + + /** + * @brief Create a deep copy of an NFA fragment. + * + * Used for handling quantifiers that require duplication of subgraphs + * (e.g. {m,n}, *, +). + */ + Frag copy_fragment(Frag); + + /** + * @brief Deep copy an NFA subgraph starting from a given state. + * + * Keeps a lookup map to avoid duplicating already-copied states. + * + * @param s Original state to copy. + * @param lookup Map from original states to their copies. + * @return Pointer to the copied state. + */ + State *copy_state(State *, std::unordered_map &); + +private: + /** + * @brief Allocate a new NFA state and store it in the internal pool. + * + * Ownership is retained by the builder to ensure correct lifetime. + */ + State *create_state(StateType type); + + /** + * @brief Owns all NFA states created during construction. + * + * Ensures that all State objects remain valid for the lifetime + * of the NfaBuilder and are automatically destroyed via RAII. 
+ */ + std::vector> state_pool; +}; + +#endif // NFA_BUILDER_HPP \ No newline at end of file diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp index 5cd2173..8ca69ca 100644 --- a/libpz/include/RegexTokenizer.hpp +++ b/libpz/include/RegexTokenizer.hpp @@ -108,7 +108,7 @@ class Tokenizer { /** Input regex pattern */ std::string_view pattern; /** Current cursor position */ - size_t i = 0; + size_t cursor_pos = 0; /** Counter for assigning group IDs */ st32 group_counter = 0; /** Stack for nested group tracking */ diff --git a/libpz/regex/NfaBuilder.cpp b/libpz/regex/NfaBuilder.cpp new file mode 100644 index 0000000..fb9285a --- /dev/null +++ b/libpz/regex/NfaBuilder.cpp @@ -0,0 +1,298 @@ +#include "NfaBuilder.hpp" +#include "pz_error.hpp" + +// Allocate a new NFA state, keep ownership in the builder(state pool), +// and return a raw pointer to the state. +State *NfaBuilder::create_state(StateType type) { + state_pool.push_back(std::make_unique(type)); + return state_pool.back().get(); +} + +// Create a deep copy of an NFA fragment. +// All states are duplicated except MATCH states, which are shared. 
+Frag NfaBuilder::copy_fragment(Frag original) { + std::unordered_map + old_to_new; // stores the states we have already visited and its cloned + // copies + State *new_start = copy_state(original.start, old_to_new); + + std::vector new_exits; + + // Traverse copied graph to collect dangling exits + std::unordered_set + visited; // Remember which states have been already visited + std::stack s; + s.push(new_start); + while (!s.empty()) // Loop until there are no more states left to process + { + State *curr = s.top(); + s.pop(); + if (!curr || visited.count(curr)) + continue; + visited.insert(curr); + // If out is null, it's a dangling exit we need to patch later (Unpatched + // primary exit) + if (!curr->out && curr->type != StateType::MATCH) { + new_exits.push_back(&curr->out); + } + // If out1 is null (and it's a SPLIT state), it's also an exit (Unpatched + // secondary exit for SPLIT states) + if (!curr->out1 && curr->type == StateType::SPLIT) { + new_exits.push_back(&curr->out1); + } + + if (curr->out) + s.push(curr->out); + if (curr->out1) + s.push(curr->out1); + } + + return Frag(new_start, new_exits); +} + +// Recursively clone an NFA subgraph starting from state 's'. +// The 'lookup' map ensures that each original state is copied exactly once. +// This preserves shared structure and prevents infinite recursion on cycles. +// MATCH states are not duplicated: a copied fragment always reconnects to +// the same final MATCH state during patching. 
+State *NfaBuilder::copy_state(State *s, + std::unordered_map &lookup) { + + // Null state or final MATCH state: return as-is + if (!s || s->type == StateType::MATCH) + return s; + + // If this state was already copied, reuse the existing clone + if (lookup.count(s)) + return lookup[s]; + + // Create a new state with the same semantic properties + State *result = create_state(s->type); + result->c = s->c; + result->ranges = s->ranges; + result->negated = s->negated; + result->save_id = s->save_id; + + // Record the mapping before recursing to handle cycles correctly + lookup[s] = result; + + // Recursively copy outgoing transitions + result->out = copy_state(s->out, lookup); + result->out1 = copy_state(s->out1, lookup); + return result; +} + +// Build an ε-NFA from a postfix (RPN) regex token sequence. +// The algorithm processes tokens left-to-right, maintaining a stack of +// NFA fragments. Each operator combines or transforms fragments according +// to standard Thompson construction rules. At the end, all dangling exits +// are patched to a single MATCH state. 
// Build an ε-NFA from a postfix (RPN) regex token sequence.
// Tokens are processed left-to-right while a stack holds partially built
// NFA fragments; each operator pops its operands, combines them following
// standard Thompson construction rules, and pushes the result. At the end
// every remaining dangling exit is patched into a single MATCH state.
State *NfaBuilder::build(const std::vector<Token> &postfix) {
  std::stack<Frag> stack;

  for (const auto &t : postfix) {
    switch (t.type) {

    // Atomic expressions: each pushes a single-state fragment whose
    // dangling exit is that state's primary transition.

    case TokenType::LITERAL: {
      State *s = create_state(StateType::CHAR);
      s->c = t.literal;
      stack.push(Frag(s));
      break;
    }
    case TokenType::DOT: {
      stack.push(Frag(create_state(StateType::DOT)));
      break;
    }
    case TokenType::CHAR_CLASS: {
      State *s = create_state(StateType::CHAR_CLASS);
      s->ranges = t.ranges;
      s->negated = t.negated;
      stack.push(Frag(s));
      break;
    }
    case TokenType::CARET: {
      stack.push(Frag(create_state(StateType::ANCHOR_START)));
      break;
    }
    case TokenType::DOLLAR: {
      stack.push(Frag(create_state(StateType::ANCHOR_END)));
      break;
    }

    // Capture groups: SAVE states record submatch boundaries. Group g uses
    // slot 2*g for its start and 2*g+1 for its end.

    case TokenType::LPAREN: {
      State *s = create_state(StateType::SAVE);
      s->save_id = t.group_id * 2; // capture start (even)
      stack.push(Frag(s));
      break;
    }
    case TokenType::RPAREN: {
      // Create the save (end) state
      State *s = create_state(StateType::SAVE);
      s->save_id = t.group_id * 2 + 1; // capture end (odd)

      // Pop the group body, then the matching SAVE-start fragment that
      // LPAREN pushed; wire start -> body -> end.
      Frag content = stack.top();
      stack.pop();
      Frag lparen_frag = stack.top();
      stack.pop();
      lparen_frag.patch(content.start);
      content.patch(s);

      // The whole group's only exit is the SAVE-end state's transition.
      stack.push(Frag(lparen_frag.start, {&s->out}));
      break;
    }

    // Binary operators:

    case TokenType::CONCAT: {
      Frag e2 = stack.top();
      stack.pop();
      Frag e1 = stack.top();
      stack.pop();
      // e1's exits flow into e2; the combined fragment keeps e2's exits.
      e1.patch(e2.start);
      stack.push(Frag(e1.start, e2.out_ptrs));
      break;
    }
    case TokenType::ALTERNATION: {
      Frag e2 = stack.top();
      stack.pop();
      Frag e1 = stack.top();
      stack.pop();
      // A SPLIT state branches into either alternative.
      State *s = create_state(StateType::SPLIT);
      s->out = e1.start;
      s->out1 = e2.start;
      // Combine dangling exits from both branches
      std::vector<State **> combined = e1.out_ptrs;
      combined.insert(combined.end(), e2.out_ptrs.begin(), e2.out_ptrs.end());
      stack.push(Frag(s, combined));
      break;
    }

    // Unary operators:

    case TokenType::STAR: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start;               // Loop back into the expression
      e.patch(s);                     // The expression's end loops back to the split
      stack.push(Frag(s, {&s->out1})); // out1 is the escape route
      break;
    }
    case TokenType::PLUS: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start; // Loop back
      e.patch(s);       // Connect expression end to split
      // Unlike STAR, the fragment starts at the expression itself, so at
      // least one iteration is mandatory.
      stack.push(Frag(e.start, {&s->out1}));
      break;
    }
    case TokenType::QUESTION: {
      Frag e = stack.top();
      stack.pop();
      State *s = create_state(StateType::SPLIT);
      s->out = e.start; // Option 1: match the expression
      // Option 2: skip the expression (out1)
      std::vector<State **> exits = e.out_ptrs;
      exits.push_back(&s->out1);
      stack.push(Frag(s, exits));
      break;
    }

    // Bounded repetition: expand {m,n} into m mandatory copies followed by
    // optional copies (or a loop for an unbounded upper bound).

    case TokenType::QUANTIFIER_RANGE: {
      Frag e = stack.top();
      stack.pop();

      // i) Handle the mandatory part (m).
      // Initialize 'mandatory' with an immediately-invoked lambda because
      // Frag has no meaningful default state. For min == 0 it is a bare
      // SPLIT acting as an epsilon placeholder.
      // NOTE(review): when t.min == 0 the epsilon SPLIT's out1 is never
      // patched and stays null — presumably the matcher skips null
      // transitions; confirm against the NFA simulator.
      Frag mandatory = [&]() {
        if (t.min == 0) {
          State *eps = create_state(StateType::SPLIT);
          return Frag(eps, {&eps->out});
        } else {
          return copy_fragment(e); // Use the first one as the base
        }
      }();
      // If min > 1, append the necessary copies
      for (int i = 1; i < t.min; i++) {
        Frag next_copy = copy_fragment(e);
        mandatory.patch(next_copy.start);
        mandatory = Frag(mandatory.start, next_copy.out_ptrs);
      }

      // ii) Handle the optional part (n - m) or the infinite tail {m,}.
      if (t.max == -1) { // {m,}
        // Trailing Kleene loop: split either repeats the expression or
        // escapes through out1.
        State *s = create_state(StateType::SPLIT);
        Frag loop_part = copy_fragment(e);

        s->out = loop_part.start;
        loop_part.patch(s);

        mandatory.patch(s);
        stack.push(Frag(mandatory.start, {&s->out1}));
      } else if (t.max > t.min) { // {m,n}
        // Build a chain of optional fragments, each one guarded by a SPLIT
        // that can either take the repetition or skip it and move on.
        Frag optional_chain = mandatory;
        std::vector<State **> all_exits;

        for (int i = 0; i < (t.max - t.min); i++) {
          Frag next_opt = copy_fragment(e);
          State *s = create_state(StateType::SPLIT);

          s->out = next_opt.start;
          optional_chain.patch(s);

          // Collect exits from the skip path
          all_exits.push_back(&s->out1);

          optional_chain = Frag(next_opt.start, next_opt.out_ptrs);
        }
        // Add exits from the last repetition: if all optional parts are
        // taken, the match continues after the final copied fragment.
        all_exits.insert(all_exits.end(), optional_chain.out_ptrs.begin(),
                         optional_chain.out_ptrs.end());
        stack.push(Frag(mandatory.start, all_exits));
      } else { // {m}
        stack.push(mandatory);
      }
      break;
    }
    default:
      break;
    }
  }

  // Empty regex produces an ε-NFA (no fragments): a lone SPLIT whose exit
  // is patched straight into MATCH below.
  if (stack.empty()) {
    State *s = create_state(StateType::SPLIT);
    stack.push(Frag(s));
  }

  // Implicit concatenation of remaining fragments (defensive: a
  // well-formed postfix stream leaves exactly one fragment).
  while (stack.size() > 1) {
    Frag e2 = stack.top();
    stack.pop();
    Frag e1 = stack.top();
    stack.pop();
    e1.patch(e2.start);
    stack.push(Frag(e1.start, e2.out_ptrs));
  }

  // Patch all remaining exits to the final MATCH state
  Frag final_frag = stack.top();
  stack.pop();
  State *match_state = create_state(StateType::MATCH);
  final_frag.patch(match_state);

  return final_frag.start;
}
'\0' : pattern[cursor_pos++]; } -bool Tokenizer::eof() const { return i >= pattern.size(); } +bool Tokenizer::eof() const { return cursor_pos >= pattern.size(); } std::vector Tokenizer::tokenize() { std::vector tokens; while (!eof()) { tokens.push_back(next_token()); } - tokens.push_back(Token{TokenType::END, i}); + tokens.push_back(Token{TokenType::END, cursor_pos}); add_concat_tokens(tokens); return tokens; } @@ -27,12 +27,12 @@ void Tokenizer::add_concat_tokens(std::vector &tokens) { normalized.reserve(tokens.size() * 2); for (size_t idx = 0; idx < tokens.size(); idx++) { - normalized.push_back(tokens[idx]); + normalized.push_back(std::move(tokens[idx])); if (idx + 1 >= tokens.size()) break; - const Token ¤t = tokens[idx]; + const Token ¤t = normalized.back(); const Token &next = tokens[idx + 1]; // Can the current token be the left side of a concatenation? @@ -65,7 +65,7 @@ Token Tokenizer::next_token() { ut8 c = get(); // Position of the character that produced this token - size_t pos = i - 1; + size_t pos = cursor_pos - 1; switch (c) { case '.': @@ -112,7 +112,7 @@ Token Tokenizer::next_token() { } Token Tokenizer::read_literal(ut8 c) { - Token t{TokenType::LITERAL, i - 1}; + Token t{TokenType::LITERAL, cursor_pos - 1}; t.literal = c; return t; } @@ -123,7 +123,7 @@ Token Tokenizer::read_escape() { "Dangling escape at end of input"); Token t; - t.pos = i - 1; + t.pos = cursor_pos - 1; ut8 c = get(); if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') { @@ -230,7 +230,7 @@ void Tokenizer::normalize_ranges(std::vector &ranges) { } Token Tokenizer::read_char_class() { - Token t{TokenType::CHAR_CLASS, i - 1}; + Token t{TokenType::CHAR_CLASS, cursor_pos - 1}; if (peek() == '^') { t.negated = true; get(); @@ -248,7 +248,7 @@ Token Tokenizer::read_char_class() { if (eof()) PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Dangling escape in char class at position " + - std::to_string(i)); + std::to_string(cursor_pos)); // Flush 
pending literal before escape if (have_prev) { t.ranges.push_back({prev, prev}); @@ -316,20 +316,20 @@ Token Tokenizer::read_char_class() { PzError::report_error( PzError::PzErrorType::PZ_INVALID_INPUT, "Dangling escape in character range at position " + - std::to_string(i)); + std::to_string(cursor_pos)); ub = get(); if (ub == 'd' || ub == 'D' || ub == 'w' || ub == 'W' || ub == 's' || ub == 'S') { PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Cannot create a range with shorthand escape " "sequences at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); } } if (prev > ub) PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT, "Invalid character range at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); t.ranges.push_back({prev, ub}); have_prev = false; continue; @@ -338,7 +338,7 @@ Token Tokenizer::read_char_class() { PzError::report_error( PzError::PzErrorType::PZ_INVALID_INPUT, "Cannot create a range with shorthand escape sequences at position " + - std::to_string(i - 1)); + std::to_string(cursor_pos - 1)); } // Flush pending literal if no range follows @@ -373,7 +373,7 @@ Token Tokenizer::read_char_class() { Token Tokenizer::read_quantifier() { // Position of '{' is stored in t.pos for error reporting - Token t{TokenType::QUANTIFIER_RANGE, i - 1}; + Token t{TokenType::QUANTIFIER_RANGE, cursor_pos - 1}; auto skip_spaces = [&]() { while (!eof() && std::isspace(peek())) {