Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions libpz/include/Nfa.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#ifndef NFA_HPP
#define NFA_HPP

#include <utility>

#include <RegexTokenizer.hpp>
#include <pz_cxx_std.hpp>
#include <pz_types.hpp>

/**
 * @brief Kinds of states that can appear in the regex NFA.
 *
 * The enumerator values are written out explicitly; they are exactly the
 * values the compiler would otherwise assign implicitly (0, 1, 2, ...).
 */
enum class StateType {
  /** Matches one literal character. */
  CHAR = 0,

  /** Matches any character ('.'). */
  DOT = 1,

  /** Matches a character class ('[...]'). */
  CHAR_CLASS = 2,

  /** Accepting (final) state. */
  MATCH = 3,

  /** ε-transition that forks into two outgoing branches. */
  SPLIT = 4,

  /** Records the current input position (used for capture groups). */
  SAVE = 5,

  /** Start-of-input anchor ('^'). */
  ANCHOR_START = 6,

  /** End-of-input anchor ('$'). */
  ANCHOR_END = 7
};

/**
 * @brief Represents a single state (node) in the NFA.
 *
 * Which of the payload fields are meaningful depends on @ref type; see the
 * individual members.
 */
struct State {
  /** Kind of state; set by the constructor. */
  StateType type;

  /**
   * Literal character to match. Meaningful only for CHAR states.
   *
   * Zero-initialized so that copying or inspecting a state of any other
   * type never reads an indeterminate value.
   */
  ut8 c = 0;

  /**
   * Capture group identifier (used by SAVE states to store input positions).
   * Even IDs represent a group start, odd IDs represent a group end.
   * Defaults to -1.
   */
  st32 save_id = -1;

  /** Character ranges for CHAR_CLASS states. */
  std::vector<CharRange> ranges;

  /**
   * Negation flag; presumably applies to @ref ranges of CHAR_CLASS states
   * (mirrors Token::negated). Defaults to false.
   */
  bool negated = false;

  /** Primary outgoing transition. */
  State *out = nullptr;

  /** Secondary outgoing transition (used only by SPLIT states). */
  State *out1 = nullptr;

  /**
   * @brief Marker used during NFA simulation.
   *
   * Marks whether this state has already been added to the current
   * active-states list. This prevents revisiting the same state multiple
   * times in a single step, avoiding duplicate entries and infinite
   * ε-transition loops. Defaults to -1.
   */
  st32 last_list = -1;

  /**
   * @brief Construct a state of the given type.
   *
   * All other members keep their default member initializers.
   *
   * @param t Kind of state to create.
   */
  State(StateType t) : type(t) {}
};

/**
* @brief Represents a partially constructed NFA fragment.
*
* A fragment consists of:
* - a start state
* - a list of dangling outgoing transitions that must be patched later
*/
struct Frag {
State *start;

/** Addresses of state pointers that need to be connected later. */
std::vector<State **> out_ptrs;

/**
* @brief Construct a fragment with a single dangling exit.
*/
Frag(State *s) : start(s) { out_ptrs.push_back(&s->out); }

/**
* @brief Construct a fragment with multiple dangling exits.
*/
Frag(State *s, std::vector<State **> out) : start(s), out_ptrs(out) {}

/**
* @brief Patch all dangling exits to point to the given state.
*/
void patch(State *s) {
for (auto &ptr : out_ptrs) {
if (ptr &&
!*ptr) { // Only patch if the pointer exists and is currently null
*ptr = s;
}
}
}
};

#endif // NFA_HPP
62 changes: 62 additions & 0 deletions libpz/include/NfaBuilder.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#ifndef NFA_BUILDER_HPP
#define NFA_BUILDER_HPP

#include <Nfa.hpp>

/**
 * @brief Builds an ε-NFA from a postfix regex token sequence.
 *
 * Implements Thompson-style construction to convert postfix regex tokens
 * into an NFA graph. All states created during construction are owned
 * internally (see @ref state_pool) and cleaned up automatically.
 */
class NfaBuilder {
public:
  /**
   * @brief Build an NFA from a postfix regex.
   *
   * The resulting NFA has a single accepting state of type
   * StateType::MATCH. The returned pointer refers to the start state.
   *
   * The states are owned by this builder, so the returned pointer (and every
   * state reachable from it) must not be used after the builder has been
   * destroyed.
   *
   * @param postfix Regex tokens in postfix (RPN) form.
   * @return Pointer to the start state of the constructed NFA.
   */
  State *build(const std::vector<Token> &postfix);

  /**
   * @brief Create a deep copy of an NFA fragment.
   *
   * Used for handling quantifiers that require duplication of subgraphs
   * (e.g. {m,n}, *, +).
   *
   * @param frag Fragment to duplicate.
   * @return The copied fragment.
   */
  Frag copy_fragment(Frag frag);

  /**
   * @brief Deep copy an NFA subgraph starting from a given state.
   *
   * Keeps a lookup map to avoid duplicating already-copied states.
   *
   * @param s Original state to copy.
   * @param lookup Map from original states to their copies.
   * @return Pointer to the copied state.
   */
  State *copy_state(State *s, std::unordered_map<State *, State *> &lookup);

private:
  /**
   * @brief Allocate a new NFA state and store it in the internal pool.
   *
   * Ownership is retained by the builder to ensure correct lifetime.
   *
   * @param type Kind of state to create.
   * @return Non-owning pointer to the newly created state.
   */
  State *create_state(StateType type);

  /**
   * @brief Owns all NFA states created during construction.
   *
   * Ensures that all State objects remain valid for the lifetime
   * of the NfaBuilder and are automatically destroyed via RAII.
   */
  std::vector<std::unique_ptr<State>> state_pool;
};

#endif // NFA_BUILDER_HPP
28 changes: 28 additions & 0 deletions libpz/include/RegexPostfix.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef REGEX_POSTFIX_HPP
#define REGEX_POSTFIX_HPP

#include <RegexTokenizer.hpp>
#include <pz_cxx_std.hpp>
#include <pz_types.hpp>

/**
* @brief Converts regex tokens from infix to postfix (RPN) form.
*
* This conversion is used as a preprocessing step before NFA construction.
* The class is stateless and intended to be used via its static methods.
*/
class Postfix {
public:
/**
* @brief Convert an infix token sequence into postfix order.
*
* NOTE(review): the input is presumably the output of Tokenizer::tokenize()
* (including CONCAT tokens and the trailing END token) — confirm against
* the definition and its callers.
*
* @param infix Regex tokens in infix order.
* @return The tokens rearranged into postfix (RPN) order.
*/
static std::vector<Token> convert(const std::vector<Token> &infix);

private:
/**
* @brief Returns precedence of a regex operator token.
*
* NOTE(review): the direction of the scale (whether a larger value binds
* tighter) and the result for non-operator token types are not visible
* from this header — confirm against the definition.
*
* @param type Token type whose precedence is requested.
* @return Precedence value for @p type.
*/
static st32 get_precedence(TokenType type);
};

#endif // REGEX_POSTFIX_HPP
145 changes: 145 additions & 0 deletions libpz/include/RegexTokenizer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#ifndef REGEX_TOKENIZER_HPP
#define REGEX_TOKENIZER_HPP

#include <pz_cxx_std.hpp>
#include <pz_types.hpp>

/**
 * @brief Categories of tokens emitted by the regex tokenizer.
 *
 * The enumerator values are written out explicitly; they are exactly the
 * values the compiler would otherwise assign implicitly (0, 1, 2, ...).
 */
enum class TokenType {
  /** Literal character such as 'a' or 'b'. */
  LITERAL = 0,

  /** Wildcard '.'. */
  DOT = 1,

  /** Operator '*'. */
  STAR = 2,

  /** Operator '+'. */
  PLUS = 3,

  /** Operator '?'. */
  QUESTION = 4,

  /** Alternation '|'. */
  ALTERNATION = 5,

  /** Group opener '('. */
  LPAREN = 6,

  /** Group closer ')'. */
  RPAREN = 7,

  /** Start anchor '^'. */
  CARET = 8,

  /** End anchor '$'. */
  DOLLAR = 9,

  /** Character class: '[...]', \d, \w, \s, etc. */
  CHAR_CLASS = 10,

  /** Quantifier range: '{m,n}', '{m,}', '{m}'. */
  QUANTIFIER_RANGE = 11,

  /** End of pattern. */
  END = 12,

  /** Implicit concatenation. */
  CONCAT = 13
};

/**
 * @brief Represents a character range [lo, hi].
 *
 * Both bounds default to 0 so that a default-constructed range never holds
 * indeterminate values. The struct remains an aggregate, so
 * `CharRange{lo, hi}` keeps working.
 */
struct CharRange {
  /** Lower bound */
  ut8 lo = 0;

  /** Upper bound */
  ut8 hi = 0;
};

/**
 * @brief A single token in the regex.
 *
 * Every member has a default member initializer, so a default-constructed
 * Token (`Token t;`) holds no indeterminate values. The struct remains an
 * aggregate, so `Token{type, pos}` style initialization keeps working.
 */
struct Token {
  /**
   * Token category.
   *
   * Defaults to TokenType::LITERAL, the zero-valued enumerator, which is
   * the value `Token{}` already produced through value-initialization.
   */
  TokenType type = TokenType::LITERAL;

  /** Position in pattern (for error reporting) */
  size_t pos = 0;

  /** Group ID for parentheses */
  st32 group_id = -1;

  /** Literal character value */
  ut8 literal = '\0';

  /** Whether character class is negated */
  bool negated = false;

  /** Character ranges for character class */
  std::vector<CharRange> ranges{};

  /** Minimum repetitions for quantifier */
  st32 min = 0;

  /** Maximum repetitions (-1 means unbounded) */
  st32 max = 0;
};

/**
* @brief Converts a regex pattern into a sequence of tokens.
*
* NOTE(review): the pattern is held in a std::string_view member, which is
* non-owning; presumably the caller's buffer must outlive the tokenizer —
* confirm against the constructor definition.
*/
class Tokenizer {
public:
/**
* @brief Construct tokenizer for a pattern.
* @param pat Regex pattern.
*/
explicit Tokenizer(std::string_view pat);

/**
* @brief Tokenize the entire pattern.
*
* NOTE(review): presumably also inserts implicit CONCAT tokens via
* add_concat_tokens() — confirm against the definition.
*
* @return Vector of tokens ending with END token.
*/
std::vector<Token> tokenize();

private:
/** Input regex pattern (non-owning view) */
std::string_view pattern;
/** Current cursor position; starts at 0 */
size_t cursor_pos = 0;
/** Counter for assigning group IDs; starts at 0 */
st32 group_counter = 0;
/** Stack for nested group tracking */
std::stack<st32> group_stack;

/** Peek next character without consuming */
ut8 peek() const;
/** Consume next character */
ut8 get();
/** Check for end of input */
bool eof() const;

/** Read next token */
Token next_token();
/**
* Read literal character.
* NOTE(review): the ut8 argument is presumably the character to wrap in the
* token — confirm against the definition.
*/
Token read_literal(ut8);
/** Read escape sequence */
Token read_escape();
/** Read character class */
Token read_char_class();
/** Read quantifier range */
Token read_quantifier();

/**
* @brief Populates a token with ranges for \d, \w, \s, etc.
*
* NOTE(review): the ut8 argument is presumably the shorthand letter
* following the backslash — confirm against the definition. The Token is
* taken by non-const reference, so it is the object being filled in.
*/
void add_shorthand_ranges(ut8, Token &);

/**
* @brief Inserts implicit CONCAT tokens where concatenation occurs.
*
* The token vector is taken by non-const reference, i.e. it is modified in
* place.
*/
void add_concat_tokens(std::vector<Token> &);

/**
* @brief Sorts and merges overlapping ranges for efficient NFA matching.
*
* The range vector is taken by non-const reference, i.e. it is modified in
* place.
*/
void normalize_ranges(std::vector<CharRange> &);
};

#endif // REGEX_TOKENIZER_HPP
1 change: 1 addition & 0 deletions libpz/include/pz_cxx_std.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <optional>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <string_view>
#include <unordered_map>
Expand Down
Loading