diff --git a/libpz/include/Nfa.hpp b/libpz/include/Nfa.hpp
new file mode 100644
index 0000000..bb011cb
--- /dev/null
+++ b/libpz/include/Nfa.hpp
@@ -0,0 +1,112 @@
+#ifndef NFA_HPP
+#define NFA_HPP
+
+// NOTE(review): the targets of the three includes below were lost when this
+// patch was extracted; presumably they provide ut8/st32, std::vector and
+// CharRange -- restore them before applying.
+#include
+#include
+#include
+
+/**
+ * @brief Types of NFA states used in regex matching.
+ */
+enum class StateType {
+  /** Match a single literal character */
+  CHAR,
+
+  /** Match any character (.) */
+  DOT,
+
+  /** Match a character class ([...]) */
+  CHAR_CLASS,
+
+  /** Accepting (final) state */
+  MATCH,
+
+  /** ε-transition with two outgoing branches */
+  SPLIT,
+
+  /** Save input position (for capture groups) */
+  SAVE,
+
+  /** Start-of-input anchor (^) */
+  ANCHOR_START,
+
+  /** End-of-input anchor ($) */
+  ANCHOR_END
+};
+
+/**
+ * @brief Represents a single state in the NFA.
+ *
+ * States are linked through raw, non-owning pointers (`out`, `out1`).
+ */
+struct State {
+  StateType type;
+
+  /**
+   * Literal character to match. Meaningful only for CHAR states; it is
+   * zero-initialized so that copying a non-CHAR state never reads an
+   * indeterminate value.
+   */
+  ut8 c = 0;
+
+  /**
+   * Capture group identifier (used by SAVE states to store input positions).
+   * Even IDs represent group start, odd IDs represent group end; -1 means
+   * "not a capture state".
+   */
+  st32 save_id = -1;
+
+  /** Character ranges for CHAR_CLASS states. */
+  std::vector<CharRange> ranges;
+
+  /** Whether the CHAR_CLASS ranges are negated ([^...]). */
+  bool negated = false;
+
+  /** Primary outgoing transition. */
+  State *out = nullptr;
+
+  /** Secondary outgoing transition (used only by SPLIT states). */
+  State *out1 = nullptr;
+
+  /**
+   * @brief Marker used during NFA simulation.
+   *
+   * Marks whether this state has already been added to the current
+   * active-states list, preventing duplicate entries and infinite
+   * ε-transition loops within a single step.
+   */
+  st32 last_list = -1;
+
+  State(StateType t) : type(t) {}
+};
+
+/**
+ * @brief Represents a partially constructed NFA fragment.
+ *
+ * A fragment consists of:
+ *  - a start state
+ *  - a list of dangling outgoing transitions that must be patched later
+ *
+ * A fragment does not own its states; it only refers to them.
+ */
+struct Frag {
+  State *start;
+
+  /** Addresses of state pointers that need to be connected later. */
+  std::vector<State **> out_ptrs;
+
+  /**
+   * @brief Construct a fragment with a single dangling exit.
+   *
+   * The exit is the primary transition (`out`) of @p s, so @p s must not be
+   * null.
+   */
+  Frag(State *s) : start(s) { out_ptrs.push_back(&s->out); }
+
+  /**
+   * @brief Construct a fragment with multiple dangling exits.
+   */
+  Frag(State *s, std::vector<State **> out) : start(s), out_ptrs(out) {}
+
+  /**
+   * @brief Patch all dangling exits to point to the given state.
+   *
+   * Exits that were already connected are left untouched.
+   */
+  void patch(State *s) {
+    for (auto &ptr : out_ptrs) {
+      // Only patch if the pointer exists and is currently null
+      if (ptr && !*ptr) {
+        *ptr = s;
+      }
+    }
+  }
+};
+
+#endif // NFA_HPP
\ No newline at end of file
diff --git a/libpz/include/NfaBuilder.hpp b/libpz/include/NfaBuilder.hpp
new file mode 100644
index 0000000..7d640eb
--- /dev/null
+++ b/libpz/include/NfaBuilder.hpp
@@ -0,0 +1,62 @@
+#ifndef NFA_BUILDER_HPP
+#define NFA_BUILDER_HPP
+
+// NOTE(review): the include target below was lost when this patch was
+// extracted -- restore it before applying.
+#include
+
+/**
+ * @brief Builds an ε-NFA from a postfix regex token sequence.
+ *
+ * Implements Thompson-style construction to convert postfix regex tokens
+ * into an NFA graph. All states created during construction are owned
+ * internally and cleaned up automatically.
+ */
+class NfaBuilder {
+public:
+  /**
+   * @brief Build an NFA from a postfix regex.
+   *
+   * The resulting NFA has a single accepting state of type
+   * StateType::MATCH. The returned pointer refers to the start state and,
+   * like every state it reaches, stays valid only while this builder is
+   * alive, because the builder owns the states.
+   *
+   * @param postfix Regex tokens in postfix (RPN) form.
+   * @return Pointer to the start state of the constructed NFA.
+   */
+  State *build(const std::vector<Token> &postfix);
+
+  /**
+   * @brief Create a deep copy of an NFA fragment.
+   *
+   * Used for bounded repetition ({m}, {m,}, {m,n}), which needs several
+   * independent copies of the repeated subgraph.
+   *
+   * @param original Fragment to duplicate.
+   * @return A new fragment whose exits are the unpatched transitions of the
+   *         copy.
+   */
+  Frag copy_fragment(Frag original);
+
+  /**
+   * @brief Deep copy an NFA subgraph starting from a given state.
+   *
+   * Keeps a lookup map to avoid duplicating already-copied states.
+   *
+   * @param s Original state to copy.
+   * @param lookup Map from original states to their copies.
+   * @return Pointer to the copied state.
+   */
+  State *copy_state(State *s, std::unordered_map<State *, State *> &lookup);
+
+private:
+  /**
+   * @brief Allocate a new NFA state and store it in the internal pool.
+   *
+   * Ownership is retained by the builder to ensure correct lifetime.
+   */
+  State *create_state(StateType type);
+
+  /**
+   * @brief Owns all NFA states created during construction.
+   *
+   * Ensures that all State objects remain valid for the lifetime
+   * of the NfaBuilder and are automatically destroyed via RAII.
+   */
+  std::vector<std::unique_ptr<State>> state_pool;
+};
+
+#endif // NFA_BUILDER_HPP
\ No newline at end of file
diff --git a/libpz/include/RegexPostfix.hpp b/libpz/include/RegexPostfix.hpp
new file mode 100644
index 0000000..ce7a76d
--- /dev/null
+++ b/libpz/include/RegexPostfix.hpp
@@ -0,0 +1,28 @@
+#ifndef REGEX_POSTFIX_HPP
+#define REGEX_POSTFIX_HPP
+
+// NOTE(review): the targets of the three includes below were lost when this
+// patch was extracted -- restore them before applying.
+#include
+#include
+#include
+
+/**
+ * @brief Converts regex tokens from infix to postfix (RPN) form.
+ *
+ * This conversion is used as a preprocessing step before NFA construction.
+ * The class is stateless and intended to be used via its static methods.
+ */
+class Postfix {
+public:
+  /**
+   * @brief Convert an infix token sequence into postfix order.
+   *
+   * @param infix Tokens in pattern order, including implicit CONCAT tokens.
+   * @return The same expression in postfix order.
+   */
+  static std::vector<Token> convert(const std::vector<Token> &infix);
+
+private:
+  /**
+   * @brief Returns precedence of a regex operator token.
+   *
+   * Higher values bind tighter; non-operator tokens yield 0.
+   */
+  static st32 get_precedence(TokenType type);
+};
+
+#endif // REGEX_POSTFIX_HPP
\ No newline at end of file
diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp
new file mode 100644
index 0000000..8ca69ca
--- /dev/null
+++ b/libpz/include/RegexTokenizer.hpp
@@ -0,0 +1,145 @@
+#ifndef REGEX_TOKENIZER_HPP
+#define REGEX_TOKENIZER_HPP
+
+// NOTE(review): the targets of the two includes below were lost when this
+// patch was extracted -- restore them before applying.
+#include
+#include
+
+/**
+ * @brief Types of tokens produced by the regex tokenizer.
+ */
+enum class TokenType {
+  /** Literal character like 'a', 'b', etc. */
+  LITERAL,
+
+  /** '.' wildcard */
+  DOT,
+
+  /** '*' operator */
+  STAR,
+
+  /** '+' operator */
+  PLUS,
+
+  /** '?' operator */
+  QUESTION,
+
+  /** '|' alternation */
+  ALTERNATION,
+
+  /** '(' opening group */
+  LPAREN,
+
+  /** ')' closing group */
+  RPAREN,
+
+  /** '^' start anchor */
+  CARET,
+
+  /** '$' end anchor */
+  DOLLAR,
+
+  /** Character class: '[...]', \d, \w, \s, etc */
+  CHAR_CLASS,
+
+  /** Quantifier range: '{m,n}', '{m,}', '{m}' */
+  QUANTIFIER_RANGE,
+
+  /** End of pattern */
+  END,
+
+  /** Implicit concatenation */
+  CONCAT
+};
+
+/**
+ * @brief Represents a character range [lo, hi].
+ */
+struct CharRange {
+  /** Lower bound */
+  ut8 lo;
+
+  /** Upper bound */
+  ut8 hi;
+};
+
+/**
+ * @brief A single token in the regex.
+ *
+ * Only the fields relevant to `type` carry meaning; the others keep their
+ * defaults.
+ */
+struct Token {
+  /** Token category */
+  TokenType type = TokenType::END;
+  /** Position in pattern (for error reporting) */
+  size_t pos = 0;
+  /** Group ID for parentheses */
+  st32 group_id = -1;
+
+  /** Literal character value */
+  ut8 literal = '\0';
+
+  /** Whether character class is negated */
+  bool negated = false;
+  /** Character ranges for character class */
+  std::vector<CharRange> ranges{};
+
+  /** Minimum repetitions for quantifier */
+  st32 min = 0;
+  /** Maximum repetitions (-1 means unbounded) */
+  st32 max = 0;
+};
+
+/**
+ * @brief Converts a regex pattern into a sequence of tokens.
+ */
+class Tokenizer {
+public:
+  /**
+   * @brief Construct tokenizer for a pattern.
+   *
+   * Only a view of the pattern is stored, so the caller must keep the
+   * underlying characters alive while the tokenizer is in use.
+   *
+   * @param pat Regex pattern.
+   */
+  explicit Tokenizer(std::string_view pat);
+
+  /**
+   * @brief Tokenize the entire pattern.
+   * @return Vector of tokens ending with END token.
+   */
+  std::vector<Token> tokenize();
+
+private:
+  /** Input regex pattern */
+  std::string_view pattern;
+  /** Current cursor position */
+  size_t cursor_pos = 0;
+  /** Counter for assigning group IDs */
+  st32 group_counter = 0;
+  /** Stack for nested group tracking */
+  std::stack<st32> group_stack;
+
+  /** Peek next character without consuming */
+  ut8 peek() const;
+  /** Consume next character */
+  ut8 get();
+  /** Check for end of input */
+  bool eof() const;
+
+  /** Read next token */
+  Token next_token();
+  /** Read literal character */
+  Token read_literal(ut8);
+  /** Read escape sequence */
+  Token read_escape();
+  /** Read character class */
+  Token read_char_class();
+  /** Read quantifier range */
+  Token read_quantifier();
+
+  /** @brief Populates a token with ranges for \d, \w, \s, etc. */
+  void add_shorthand_ranges(ut8, Token &);
+
+  /** @brief Inserts implicit CONCAT tokens where concatenation occurs. */
+  void add_concat_tokens(std::vector<Token> &);
+
+  /** @brief Sorts and merges overlapping ranges for efficient NFA matching. */
+  void normalize_ranges(std::vector<CharRange> &);
+};
+
+#endif // REGEX_TOKENIZER_HPP
\ No newline at end of file
diff --git a/libpz/include/pz_cxx_std.hpp b/libpz/include/pz_cxx_std.hpp
index f4c0160..4b38066 100644
--- a/libpz/include/pz_cxx_std.hpp
+++ b/libpz/include/pz_cxx_std.hpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/libpz/regex/NfaBuilder.cpp b/libpz/regex/NfaBuilder.cpp
new file mode 100644
index 0000000..fb9285a
--- /dev/null
+++ b/libpz/regex/NfaBuilder.cpp
@@ -0,0 +1,298 @@
+#include "NfaBuilder.hpp"
+#include "pz_error.hpp"
+
+// Allocate a new NFA state, keep ownership in the builder (state pool),
+// and return a raw pointer to the state. The pool stores unique_ptrs, so the
+// returned address stays stable when the pool grows.
+State *NfaBuilder::create_state(StateType type) {
+  state_pool.push_back(std::make_unique<State>(type));
+  return state_pool.back().get();
+}
+
+// Create a deep copy of an NFA fragment.
+// All states are duplicated except MATCH states, which are shared.
+Frag NfaBuilder::copy_fragment(Frag original) {
+  // Maps every original state that has been visited to its clone.
+  std::unordered_map<State *, State *> old_to_new;
+  State *new_start = copy_state(original.start, old_to_new);
+
+  // Walk the cloned graph depth-first and collect its dangling exits, i.e.
+  // the transitions that are still null and must be patched by the caller.
+  std::vector<State **> new_exits;
+  std::unordered_set<State *> visited;
+  std::stack<State *> pending;
+  pending.push(new_start);
+  while (!pending.empty()) {
+    State *curr = pending.top();
+    pending.pop();
+    // insert() reports through .second whether the state is new, so a single
+    // lookup both tests and records the visit.
+    if (!curr || !visited.insert(curr).second)
+      continue;
+
+    // Unpatched primary exit (a MATCH state legitimately has no successor).
+    if (!curr->out && curr->type != StateType::MATCH) {
+      new_exits.push_back(&curr->out);
+    }
+    // Unpatched secondary exit; only SPLIT states use out1.
+    if (!curr->out1 && curr->type == StateType::SPLIT) {
+      new_exits.push_back(&curr->out1);
+    }
+
+    if (curr->out)
+      pending.push(curr->out);
+    if (curr->out1)
+      pending.push(curr->out1);
+  }
+
+  return Frag(new_start, new_exits);
+}
+
+// Recursively clone an NFA subgraph starting from state 's'.
+// The 'lookup' map ensures that each original state is copied exactly once.
+// This preserves shared structure and prevents infinite recursion on cycles.
+// MATCH states are not duplicated: a copied fragment always reconnects to
+// the same final MATCH state during patching.
+State *NfaBuilder::copy_state(State *s,
+                              std::unordered_map<State *, State *> &lookup) {
+
+  // Null state or final MATCH state: return as-is
+  if (!s || s->type == StateType::MATCH)
+    return s;
+
+  // If this state was already copied, reuse the existing clone (one lookup
+  // instead of count() followed by operator[]).
+  const auto known = lookup.find(s);
+  if (known != lookup.end())
+    return known->second;
+
+  // Create a new state with the same semantic properties. The simulation
+  // marker (last_list) is deliberately left at its default.
+  State *result = create_state(s->type);
+  result->c = s->c;
+  result->ranges = s->ranges;
+  result->negated = s->negated;
+  result->save_id = s->save_id;
+
+  // Record the mapping before recursing to handle cycles correctly
+  lookup.emplace(s, result);
+
+  // Recursively copy outgoing transitions
+  result->out = copy_state(s->out, lookup);
+  result->out1 = copy_state(s->out1, lookup);
+  return result;
+}
+
+// Build an ε-NFA from a postfix (RPN) regex token sequence.
+// The algorithm processes tokens left-to-right, maintaining a stack of
+// NFA fragments. Each operator combines or transforms fragments according
+// to standard Thompson construction rules. At the end, all dangling exits
+// are patched to a single MATCH state.
+State *NfaBuilder::build(const std::vector<Token> &postfix) {
+  std::stack<Frag> stack;
+
+  // Pop the top fragment. An empty stack means the postfix sequence is
+  // malformed (an operator lacks an operand); calling top() on an empty
+  // std::stack would be undefined behaviour, so report it instead.
+  // NOTE(review): relies on report_error not returning -- confirm.
+  auto pop_frag = [&stack]() -> Frag {
+    if (stack.empty())
+      PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                            "Malformed regex: operator is missing an operand");
+    Frag top = stack.top();
+    stack.pop();
+    return top;
+  };
+
+  for (const auto &t : postfix) {
+    switch (t.type) {
+
+    // Atomic expressions:
+
+    case TokenType::LITERAL: {
+      State *s = create_state(StateType::CHAR);
+      s->c = t.literal;
+      stack.push(Frag(s));
+      break;
+    }
+    case TokenType::DOT: {
+      stack.push(Frag(create_state(StateType::DOT)));
+      break;
+    }
+    case TokenType::CHAR_CLASS: {
+      State *s = create_state(StateType::CHAR_CLASS);
+      s->ranges = t.ranges;
+      s->negated = t.negated;
+      stack.push(Frag(s));
+      break;
+    }
+    case TokenType::CARET: {
+      stack.push(Frag(create_state(StateType::ANCHOR_START)));
+      break;
+    }
+    case TokenType::DOLLAR: {
+      stack.push(Frag(create_state(StateType::ANCHOR_END)));
+      break;
+    }
+
+    // Capture groups:
+
+    case TokenType::LPAREN: {
+      State *s = create_state(StateType::SAVE);
+      s->save_id = t.group_id * 2; // capture start (even)
+      stack.push(Frag(s));
+      break;
+    }
+    case TokenType::RPAREN: {
+      // Create the save (end) state
+      State *s = create_state(StateType::SAVE);
+      s->save_id = t.group_id * 2 + 1; // capture end (odd)
+
+      // Extract the content of the group along with save (start)
+      Frag content = pop_frag();
+      Frag lparen_frag = pop_frag();
+      lparen_frag.patch(content.start);
+      content.patch(s);
+
+      // Push the whole fragment
+      stack.push(Frag(lparen_frag.start, {&s->out}));
+      break;
+    }
+
+    // Binary operators:
+
+    case TokenType::CONCAT: {
+      Frag e2 = pop_frag();
+      Frag e1 = pop_frag();
+      e1.patch(e2.start);
+      stack.push(Frag(e1.start, e2.out_ptrs));
+      break;
+    }
+    case TokenType::ALTERNATION: {
+      Frag e2 = pop_frag();
+      Frag e1 = pop_frag();
+      State *s = create_state(StateType::SPLIT);
+      s->out = e1.start;
+      s->out1 = e2.start;
+      // Combine dangling exits from both branches
+      std::vector<State **> combined = e1.out_ptrs;
+      combined.insert(combined.end(), e2.out_ptrs.begin(), e2.out_ptrs.end());
+      stack.push(Frag(s, combined));
+      break;
+    }
+
+    // Unary operators:
+
+    case TokenType::STAR: {
+      Frag e = pop_frag();
+      State *s = create_state(StateType::SPLIT);
+      s->out = e.start;                // Loop back into the expression
+      e.patch(s);                      // The expression's end loops back to the split
+      stack.push(Frag(s, {&s->out1})); // out1 is the escape route
+      break;
+    }
+    case TokenType::PLUS: {
+      Frag e = pop_frag();
+      State *s = create_state(StateType::SPLIT);
+      s->out = e.start; // Loop back
+      e.patch(s);       // Connect expression end to split
+      // Entry is the expression itself, so it is matched at least once.
+      stack.push(Frag(e.start, {&s->out1}));
+      break;
+    }
+    case TokenType::QUESTION: {
+      Frag e = pop_frag();
+      State *s = create_state(StateType::SPLIT);
+      s->out = e.start; // Option 1: match the expression
+      // Option 2: skip the expression (out1)
+      std::vector<State **> exits = e.out_ptrs;
+      exits.push_back(&s->out1);
+      stack.push(Frag(s, exits));
+      break;
+    }
+
+    // Bounded repetition:
+
+    case TokenType::QUANTIFIER_RANGE: {
+      Frag e = pop_frag();
+
+      // i) Handle the mandatory part (m)
+      // Initialize 'mandatory' with an immediately-invoked lambda (no valid
+      // default state).
+      Frag mandatory = [&]() {
+        if (t.min == 0) {
+          // Zero mandatory repetitions: a SPLIT used as a plain ε-state whose
+          // secondary branch is never connected here.
+          State *eps = create_state(StateType::SPLIT);
+          return Frag(eps, {&eps->out});
+        } else {
+          return copy_fragment(e); // Use the first one as the base
+        }
+      }();
+      // If min > 1, append the necessary copies
+      for (int i = 1; i < t.min; i++) {
+        Frag next_copy = copy_fragment(e);
+        mandatory.patch(next_copy.start);
+        mandatory = Frag(mandatory.start, next_copy.out_ptrs);
+      }
+
+      // ii) Handle the optional part (n - m) or infinite (m, )
+      if (t.max == -1) { // {m,}
+        State *s = create_state(StateType::SPLIT);
+        Frag loop_part = copy_fragment(e);
+
+        s->out = loop_part.start;
+        loop_part.patch(s);
+
+        mandatory.patch(s);
+        stack.push(Frag(mandatory.start, {&s->out1}));
+      } else if (t.max > t.min) { // {m,n}
+        // Build a chain of optional fragments, each one guarded by a SPLIT that
+        // can either take the repetition or skip it and move on
+        Frag optional_chain = mandatory;
+        std::vector<State **> all_exits;
+
+        for (int i = 0; i < (t.max - t.min); i++) {
+          Frag next_opt = copy_fragment(e);
+          State *s = create_state(StateType::SPLIT);
+
+          s->out = next_opt.start;
+          optional_chain.patch(s);
+
+          // Collect exits from the skip path
+          all_exits.push_back(&s->out1);
+
+          optional_chain = Frag(next_opt.start, next_opt.out_ptrs);
+        }
+        // Add exits from the last repetition: if all optional parts are taken,
+        // the match can continue after the final copied fragment.
+        all_exits.insert(all_exits.end(), optional_chain.out_ptrs.begin(),
+                         optional_chain.out_ptrs.end());
+        stack.push(Frag(mandatory.start, all_exits));
+      } else { // {m}
+        stack.push(mandatory);
+      }
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  // Empty regex produces an ε-NFA (No fragments)
+  if (stack.empty()) {
+    State *s = create_state(StateType::SPLIT);
+    stack.push(Frag(s));
+  }
+
+  // Implicit concatenation of remaining fragments
+  while (stack.size() > 1) {
+    Frag e2 = pop_frag();
+    Frag e1 = pop_frag();
+    e1.patch(e2.start);
+    stack.push(Frag(e1.start, e2.out_ptrs));
+  }
+
+  // Patch all remaining exits to the final MATCH state
+  Frag final_frag = pop_frag();
+  State *match_state = create_state(StateType::MATCH);
+  final_frag.patch(match_state);
+
+  return final_frag.start;
+}
\ No newline at end of file
diff --git a/libpz/regex/RegexPostfix.cpp b/libpz/regex/RegexPostfix.cpp
new file mode 100644
index 0000000..9c51a83
--- /dev/null
+++ b/libpz/regex/RegexPostfix.cpp
@@ -0,0 +1,126 @@
+#include "RegexPostfix.hpp"
+#include "pz_error.hpp"
+
+// Precedence used by the shunting-yard conversion: higher binds tighter,
+// 0 means "not an operator".
+st32 Postfix::get_precedence(TokenType type) {
+  switch (type) {
+  case TokenType::STAR:
+  case TokenType::PLUS:
+  case TokenType::QUESTION:
+  case TokenType::QUANTIFIER_RANGE:
+    return 3; // Unary postfix operators
+  case TokenType::CONCAT:
+    return 2; // Implicit concatenation
+  case TokenType::ALTERNATION:
+    return 1; // Lowest precedence
+  default:
+    return 0;
+  }
+}
+
+// Convert an infix token sequence (with explicit CONCAT tokens) to postfix
+// order. Group delimiters are kept in the output so the NFA builder can emit
+// capture states. Malformed patterns are reported through PzError.
+std::vector<Token> Postfix::convert(const std::vector<Token> &infix) {
+  std::vector<Token> postfix;
+  std::stack<Token> operators;
+  TokenType last_type = TokenType::END; // Tracks previous token for validation
+
+  for (const auto &t : infix) {
+    switch (t.type) {
+    // Operands go directly to output
+    case TokenType::LITERAL:
+    case TokenType::DOT:
+    case TokenType::CHAR_CLASS:
+    case TokenType::CARET:
+    case TokenType::DOLLAR:
+      postfix.push_back(t);
+      break;
+
+    // '(' is pushed to operator stack and output (for NFA grouping)
+    case TokenType::LPAREN: {
+      postfix.push_back(t);
+      operators.push(t);
+      break;
+    }
+
+    // Pop operators until matching '(' is found
+    case TokenType::RPAREN: {
+      if (last_type == TokenType::LPAREN)
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Empty Parentheses at position " +
+                                  std::to_string(t.pos));
+      // "(a|)" would emit an alternation with a single operand and make the
+      // NFA builder pop from an empty stack, so reject it here.
+      if (last_type == TokenType::ALTERNATION)
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Invalid '|' before ')' at position " +
+                                  std::to_string(t.pos) +
+                                  ". It must separate two expressions.");
+      while (!operators.empty() && operators.top().type != TokenType::LPAREN) {
+        postfix.push_back(operators.top());
+        operators.pop();
+      }
+      if (operators.empty())
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Mismatched ')' at position " +
+                                  std::to_string(t.pos));
+      operators.pop(); // Discard '('
+      postfix.push_back(t);
+      break;
+    }
+    // Unary postfix operators must follow a valid expression
+    case TokenType::STAR:
+    case TokenType::PLUS:
+    case TokenType::QUESTION:
+    case TokenType::QUANTIFIER_RANGE:
+      if (last_type != TokenType::LITERAL && last_type != TokenType::DOT &&
+          last_type != TokenType::CHAR_CLASS &&
+          last_type != TokenType::RPAREN) {
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Quantifier used without a valid preceding "
+                              "expression at position " +
+                                  std::to_string(t.pos));
+      }
+      // A postfix operator applies to the operand just emitted, so it goes
+      // straight to the output.
+      postfix.push_back(t);
+      break;
+
+    case TokenType::ALTERNATION:
+      // '|' must separate two valid expressions
+      if (last_type == TokenType::END || last_type == TokenType::LPAREN ||
+          last_type == TokenType::ALTERNATION) {
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Invalid '|' at position " +
+                                  std::to_string(t.pos) +
+                                  ". It must separate two expressions.");
+      }
+      // Once validated, '|' is handled like any other binary operator.
+      [[fallthrough]];
+
+    // Binary operators handled via precedence rules
+    case TokenType::CONCAT:
+      while (!operators.empty() && operators.top().type != TokenType::LPAREN &&
+             get_precedence(operators.top().type) >= get_precedence(t.type)) {
+        postfix.push_back(operators.top());
+        operators.pop();
+      }
+      operators.push(t);
+      break;
+
+    default:
+      break;
+    }
+
+    if (t.type != TokenType::END)
+      last_type = t.type;
+  }
+
+  // Pattern must not end with a binary operator
+  if (last_type == TokenType::ALTERNATION || last_type == TokenType::CONCAT) {
+    PzError::report_error(
+        PzError::PzErrorType::PZ_INVALID_INPUT,
+        "Trailing binary operator at end of pattern at position " +
+            std::to_string(infix.back().pos));
+  }
+
+  // Drain remaining operators
+  while (!operators.empty()) {
+    if (operators.top().type == TokenType::LPAREN)
+      PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                            "Unmatched '(' at position " +
+                                std::to_string(operators.top().pos));
+    postfix.push_back(operators.top());
+    operators.pop();
+  }
+
+  return postfix;
+}
\ No newline at end of file
diff --git a/libpz/regex/RegexTokenizer.cpp b/libpz/regex/RegexTokenizer.cpp
new file mode 100644
index 0000000..5e9b3c2
--- /dev/null
+++ b/libpz/regex/RegexTokenizer.cpp
@@ -0,0 +1,433 @@
+#include "RegexTokenizer.hpp"
+#include "pz_error.hpp"
+
+Tokenizer::Tokenizer(std::string_view pat) : pattern(pat) {}
+
+// Next character without consuming it; '\0' once the pattern is exhausted.
+ut8 Tokenizer::peek() const { return eof() ? '\0' : pattern[cursor_pos]; }
+
+// Consume and return the next character; '\0' once the pattern is exhausted.
+ut8 Tokenizer::get() { return eof() ? '\0' : pattern[cursor_pos++]; }
+
+bool Tokenizer::eof() const { return cursor_pos >= pattern.size(); }
+
+// Tokenize the whole pattern, append the END token and insert the implicit
+// CONCAT tokens.
+std::vector<Token> Tokenizer::tokenize() {
+  std::vector<Token> tokens;
+  while (!eof()) {
+    tokens.push_back(next_token());
+  }
+  tokens.push_back(Token{TokenType::END, cursor_pos});
+  add_concat_tokens(tokens);
+  return tokens;
+}
+
+// Insert a CONCAT token between every pair of adjacent tokens where the left
+// one can end an expression and the right one can start one.
+void Tokenizer::add_concat_tokens(std::vector<Token> &tokens) {
+  // With at most one real token (plus END) there is nothing to concatenate.
+  if (tokens.size() <= 2)
+    return;
+
+  std::vector<Token> normalized;
+  normalized.reserve(tokens.size() * 2);
+
+  for (size_t idx = 0; idx < tokens.size(); idx++) {
+    normalized.push_back(std::move(tokens[idx]));
+
+    if (idx + 1 >= tokens.size())
+      break;
+
+    const Token &current = normalized.back();
+    const Token &next = tokens[idx + 1];
+
+    // Can the current token be the left side of a concatenation?
+    // Anchors are zero-width operands, so both '^' and '$' count on either
+    // side; otherwise patterns such as "$a|b" or "a$b|c" lose a CONCAT and
+    // the alternation binds to the wrong operands.
+    bool is_ender =
+        (current.type == TokenType::LITERAL || current.type == TokenType::DOT ||
+         current.type == TokenType::CHAR_CLASS ||
+         current.type == TokenType::RPAREN || current.type == TokenType::STAR ||
+         current.type == TokenType::PLUS ||
+         current.type == TokenType::QUESTION ||
+         current.type == TokenType::QUANTIFIER_RANGE ||
+         current.type == TokenType::CARET ||
+         current.type == TokenType::DOLLAR);
+
+    // Can the next token be the right side of a concatenation?
+    bool is_starter =
+        (next.type == TokenType::LITERAL || next.type == TokenType::DOT ||
+         next.type == TokenType::LPAREN || next.type == TokenType::CHAR_CLASS ||
+         next.type == TokenType::CARET || next.type == TokenType::DOLLAR);
+
+    if (is_ender && is_starter) {
+      Token concat;
+      concat.type = TokenType::CONCAT;
+      concat.pos = current.pos;
+      normalized.push_back(concat);
+    }
+  }
+
+  tokens = std::move(normalized);
+}
+
+// Read one token starting at the cursor. Must not be called at end of input.
+Token Tokenizer::next_token() {
+  ut8 c = get();
+
+  // Position of the character that produced this token
+  size_t pos = cursor_pos - 1;
+
+  switch (c) {
+  case '.':
+    return {TokenType::DOT, pos};
+  case '*':
+    return {TokenType::STAR, pos};
+  case '+':
+    return {TokenType::PLUS, pos};
+  case '?':
+    return {TokenType::QUESTION, pos};
+  case '|':
+    return {TokenType::ALTERNATION, pos};
+  case '(': {
+    // Groups are numbered from 1 in order of their opening parenthesis.
+    st32 id = ++group_counter;
+    group_stack.push(id);
+    Token t{TokenType::LPAREN, pos};
+    t.group_id = id;
+    return t;
+  }
+  case ')': {
+    if (group_stack.empty())
+      PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                            "Mismatched ')' at position " +
+                                std::to_string(pos));
+    st32 id = group_stack.top();
+    group_stack.pop();
+    Token t{TokenType::RPAREN, pos};
+    t.group_id = id;
+    return t;
+  }
+  case '^':
+    return {TokenType::CARET, pos};
+  case '$':
+    return {TokenType::DOLLAR, pos};
+  case '\\':
+    return read_escape();
+  case '[':
+    return read_char_class();
+  case '{':
+    return read_quantifier();
+  default:
+    return read_literal(c);
+  }
+}
+
+// Build a LITERAL token for the character that was just consumed.
+Token Tokenizer::read_literal(ut8 c) {
+  Token t{TokenType::LITERAL, cursor_pos - 1};
+  t.literal = c;
+  return t;
+}
+
+// Read the escape sequence whose backslash was just consumed. Shorthand
+// classes (\d, \w, \s and their negations) become CHAR_CLASS tokens; every
+// other escape becomes a LITERAL.
+Token Tokenizer::read_escape() {
+  if (eof())
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Dangling escape at end of input");
+
+  Token t;
+  t.pos = cursor_pos - 1;
+  ut8 c = get();
+
+  if (c == 'd' || c == 'D' || c == 'w' || c == 'W' || c == 's' || c == 'S') {
+    t.type = TokenType::CHAR_CLASS;
+    add_shorthand_ranges(c, t);
+    return t;
+  }
+
+  t.type = TokenType::LITERAL;
+  switch (c) {
+  case 'n':
+    t.literal = '\n';
+    break;
+  case 't':
+    t.literal = '\t';
+    break;
+  case 'r':
+    t.literal = '\r';
+    break;
+  case 'f':
+    t.literal = '\f';
+    break;
+  case 'v':
+    t.literal = '\v';
+    break;
+  default:
+    // Any other escaped character stands for itself (e.g. \. or \\).
+    t.literal = c;
+    break;
+  }
+  return t;
+}
+
+// Append the ranges of a shorthand class (d, D, w, W, s, S) to the token.
+// Negated shorthands are expressed as the complement within
+// [MIN_CHAR, MAX_CHAR]. Unknown letters add nothing.
+void Tokenizer::add_shorthand_ranges(ut8 c, Token &t) {
+  static constexpr ut8 MIN_CHAR = 0;         // ascii index 0
+  static constexpr ut8 MAX_CHAR = ASCII_MAX; // ascii index 127
+  switch (c) {
+  case 'd':
+    t.ranges.push_back({48, 57}); // '0' - '9'
+    break;
+  case 'D':
+    t.ranges.insert(t.ranges.end(), {
+                                        {MIN_CHAR, 47}, // Everything before '0'
+                                        {58, MAX_CHAR}  // Everything after '9'
+                                    });
+    break;
+  case 'w':
+    t.ranges.insert(
+        t.ranges.end(),
+        {{97, 122}, {65, 90}, {48, 57}, {95, 95}}); // a-z, A-Z, 0-9, _
+    break;
+  case 'W':
+    t.ranges.insert(t.ranges.end(), {
+                                        {MIN_CHAR, 47}, // Before '0'
+                                        {58, 64},  // Between '9' and 'A'
+                                        {91, 94},  // Between 'Z' and '_'
+                                        {96, 96},  // Between '_' and 'a'
+                                        {123, MAX_CHAR} // After 'z'
+                                    });
+    break;
+  case 's':
+    t.ranges.insert(t.ranges.end(), {{32, 32}, // Space
+                                     {9, 13}}  // \t, \n, \v, \f, \r
+    );
+    break;
+
+  case 'S':
+    t.ranges.insert(t.ranges.end(), {
+                                        {MIN_CHAR, 8}, // Before \t
+                                        {14, 31}, // Between \r and Space
+                                        {33, MAX_CHAR} // After Space
+                                    });
+    break;
+  }
+}
+
+// Sort the ranges and merge those that overlap or touch, in place.
+void Tokenizer::normalize_ranges(std::vector<CharRange> &ranges) {
+  if (ranges.empty())
+    return;
+
+  std::sort(ranges.begin(), ranges.end(),
+            [](const CharRange &a, const CharRange &b) {
+              if (a.lo != b.lo)
+                return a.lo < b.lo;
+              return a.hi < b.hi;
+            });
+
+  // ranges[0..write] holds the merged result built so far.
+  size_t write = 0;
+
+  for (size_t read = 1; read < ranges.size(); ++read) {
+    CharRange &last = ranges[write];
+    const CharRange &cur = ranges[read];
+
+    // "+ 1" also merges adjacent ranges; the operands are promoted to int,
+    // so the addition cannot wrap.
+    if (cur.lo <= last.hi + 1) {
+      // merge into last
+      last.hi = std::max(last.hi, cur.hi);
+    } else {
+      // move cur to next write position
+      ++write;
+      ranges[write] = cur;
+    }
+  }
+
+  ranges.resize(write + 1);
+}
+
+// Read a bracketed character class whose '[' was just consumed and return it
+// as a CHAR_CLASS token with normalized ranges.
+Token Tokenizer::read_char_class() {
+  Token t{TokenType::CHAR_CLASS, cursor_pos - 1};
+  if (peek() == '^') {
+    t.negated = true;
+    get();
+  }
+
+  // True for the letters that form a shorthand class after a backslash.
+  auto is_shorthand = [](ut8 e) {
+    return e == 'd' || e == 'D' || e == 'w' || e == 'W' || e == 's' ||
+           e == 'S';
+  };
+
+  // Maps the character following a backslash to the control character it
+  // denotes; any other character stands for itself.
+  auto unescape = [](ut8 e) -> ut8 {
+    switch (e) {
+    case 'n':
+      return '\n';
+    case 't':
+      return '\t';
+    case 'r':
+      return '\r';
+    case 'f':
+      return '\f';
+    case 'v':
+      return '\v';
+    default:
+      return e;
+    }
+  };
+
+  bool have_prev = false;          // pending character for range
+  bool last_was_shorthand = false; // whether last token was \d, \w, etc.
+  ut8 prev = 0;                    // only meaningful while have_prev is true
+
+  // Read until closing ']'
+  while (!eof() && peek() != ']') {
+    ut8 c = get();
+    if (c == '\\') // Handle escape sequences
+    {
+      if (eof())
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Dangling escape in char class at position " +
+                                  std::to_string(cursor_pos));
+      // Flush pending literal before escape
+      if (have_prev) {
+        t.ranges.push_back({prev, prev});
+        have_prev = false;
+      }
+      c = get();
+      if (is_shorthand(c)) {
+        // Shorthand character classes contribute their ranges directly.
+        add_shorthand_ranges(c, t);
+        last_was_shorthand = true;
+      } else {
+        // Control escapes and escaped literals become the pending character,
+        // so they can still open a range.
+        prev = unescape(c);
+        have_prev = true;
+        last_was_shorthand = false;
+      }
+      continue;
+    }
+
+    // Handle range syntax:
+    if (have_prev && c == '-' &&
+        peek() != ']') { // when '-' acts as a range specifier
+      ut8 ub = get();
+      if (ub == '\\') // Handle escaped upper bound
+      {
+        if (eof())
+          PzError::report_error(
+              PzError::PzErrorType::PZ_INVALID_INPUT,
+              "Dangling escape in character range at position " +
+                  std::to_string(cursor_pos));
+        ub = get();
+        if (is_shorthand(ub)) {
+          PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                                "Cannot create a range with shorthand escape "
+                                "sequences at position " +
+                                    std::to_string(cursor_pos - 1));
+        }
+        // Translate control escapes the same way as for the lower bound, so
+        // that e.g. [\t-\r] ends at carriage return, not at the letter 'r'.
+        ub = unescape(ub);
+      }
+      if (prev > ub)
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Invalid character range at position " +
+                                  std::to_string(cursor_pos - 1));
+      t.ranges.push_back({prev, ub});
+      have_prev = false;
+      continue;
+    }
+    if (c == '-' && last_was_shorthand && peek() != ']') {
+      PzError::report_error(
+          PzError::PzErrorType::PZ_INVALID_INPUT,
+          "Cannot create a range with shorthand escape sequences at position " +
+              std::to_string(cursor_pos - 1));
+    }
+
+    // Flush pending literal if no range follows
+    if (have_prev)
+      t.ranges.push_back({prev, prev});
+
+    prev = c;
+    have_prev = true;
+    last_was_shorthand = false;
+  }
+
+  // Missing closing ']'
+  if (eof())
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Unterminated character class starting at position " +
+                              std::to_string(t.pos));
+  if (have_prev)
+    t.ranges.push_back({prev, prev}); // Flush last pending character
+  if (t.ranges.empty())
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Empty char class starting at position " +
+                              std::to_string(t.pos)); // Disallow empty classes
+  get(); // consume ']'
+  normalize_ranges(t.ranges);
+  return t;
+}
+// NOTE: []] will be treated as an empty character class followed by a ] literal
+// In many regex implementations, it gets processed as a valid char class with
+// literal ']' but we currently treat the earliest found ] as the end of the
+// char class as a design choice. To use ] as a literal inside the char class,
+// user needs to escape it.
+
+// Read a quantifier of the form {m}, {m,}, {m,n} or {,n} whose '{' was just
+// consumed. Whitespace around the numbers is ignored. An omitted minimum
+// defaults to 0 and an omitted maximum is stored as -1 (unbounded).
+Token Tokenizer::read_quantifier() {
+  // Position of '{' is stored in t.pos for error reporting
+  Token t{TokenType::QUANTIFIER_RANGE, cursor_pos - 1};
+
+  auto skip_spaces = [&]() {
+    while (!eof() && std::isspace(peek())) {
+      get();
+    }
+  };
+
+  // Parse a non-negative decimal number. A missing number is accepted only
+  // when a ',' follows, which allows the "{,n}" form.
+  auto read_int = [&]() -> st32 {
+    // Largest accepted count. NOTE(review): assumes st32 is a 32-bit signed
+    // integer -- confirm against its typedef.
+    constexpr long long max_count = 2147483647LL;
+
+    skip_spaces();
+    // Accumulate in a wider type so that a long run of digits cannot cause
+    // signed overflow (undefined behaviour) before it is rejected.
+    long long val = 0;
+    bool found = false;
+    while (!eof() && std::isdigit(peek())) {
+      found = true;
+      val = val * 10 + (get() - '0');
+      if (val > max_count) {
+        PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                              "Quantifier value too large at position " +
+                                  std::to_string(t.pos));
+        val = max_count; // keep the arithmetic bounded in any case
+      }
+    }
+    if (!found && peek() != ',')
+      PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                            "Expected number in quantifier at position " +
+                                std::to_string(t.pos));
+    skip_spaces();
+    return static_cast<st32>(val);
+  };
+
+  t.min = read_int();
+
+  // {m}: exact repetition count
+  if (peek() == '}') {
+    get();
+    t.max = t.min;
+    return t;
+  }
+
+  if (peek() != ',')
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Invalid quantifier syntax at position " +
+                              std::to_string(t.pos));
+  get();
+  skip_spaces();
+
+  // {m,}: no upper bound
+  if (peek() == '}') {
+    get();
+    t.max = -1;
+    return t;
+  }
+
+  // {m,n}: bounded range
+  t.max = read_int();
+  if (peek() != '}')
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Invalid quantifier syntax at position " +
+                              std::to_string(t.pos));
+  get();
+
+  if (t.max != -1 && t.max < t.min)
+    PzError::report_error(PzError::PzErrorType::PZ_INVALID_INPUT,
+                          "Invalid quantifier range at position " +
+                              std::to_string(t.pos));
+  return t;
+}
\ No newline at end of file