Programming-Club-Org · Ovetsarilish · Feb 8, 2026 · Jan 26, 2026 · Jan 27, 2026 · Jan 30, 2026
diff --git a/libpz/include/Nfa.hpp b/libpz/include/Nfa.hpp
@@ -0,0 +1,112 @@
+#ifndef NFA_HPP
+#define NFA_HPP
+
+#include <RegexTokenizer.hpp>
+#include <pz_cxx_std.hpp>
+#include <pz_types.hpp>
+
+/**
+ * @brief Types of NFA states used in regex matching.
+ */
+enum class StateType {
+  /** Match a single literal character (the byte held in State::c) */
+  CHAR,
+
+  /** Match any character (.) */
+  DOT,
+
+  /** Match a character class ([...]); its ranges live in State::ranges */
+  CHAR_CLASS,
+
+  /** Accepting (final) state */
+  MATCH,
+
+  /** ε-transition with two outgoing branches (State::out and State::out1) */
+  SPLIT,
+
+  /** Save input position (for capture groups); slot is State::save_id */
+  SAVE,
+
+  /** Start-of-input anchor (^) */
+  ANCHOR_START,
+
+  /** End-of-input anchor ($) */
+  ANCHOR_END
+};
+
+/**
+ * @brief Represents a single state in the NFA.
+ */
+struct State {
+  /** Kind of state; set by the constructor. */
+  StateType type;
+
+  /** Literal character to match. Meaningful only for CHAR states; it is
+   * zero-initialised so that every State starts with a defined value. */
+  ut8 c = 0;
+
+  /** Capture group identifier (used by SAVE states); defaults to -1.
+   * Even IDs represent group start, odd IDs represent group end. */
+  st32 save_id = -1;
+
+  /** Character ranges for CHAR_CLASS states. */
+  std::vector<CharRange> ranges;
+  /** Negation flag for the character class (cf. Token::negated). */
+  bool negated = false;
+
+  /** Primary outgoing transition. */
+  State *out = nullptr;
+
+  /** Secondary outgoing transition (used only by SPLIT states). */
+  State *out1 = nullptr;
+
+  /**
+   * @brief Marker used during NFA simulation.
+   *
+   * Marks whether this state has already been added to the current
+   * active-states list, preventing duplicate entries within a single step
+   * and infinite ε-transition loops. Defaults to -1.
+   */
+  st32 last_list = -1;
+
+  /** Construct a state of kind @p t; every other field keeps its default. */
+  State(StateType t) : type(t) {}
+};
+
+/**
+ * @brief Represents a partially constructed NFA fragment.
+ *
+ * A fragment consists of:
+ *  - a start state
+ *  - a list of dangling outgoing transitions that must be patched later
+ */
+struct Frag {
+  State *start;
+
+  /** Addresses of state pointers that need to be connected later. */
+  std::vector<State **> out_ptrs;
+
+  /** @brief Construct a fragment whose single dangling exit is `s->out`.
+   *  A null @p s gives a fragment with no exits rather than dereferencing it. */
+  Frag(State *s) : start(s) {
+    if (s) {
+      out_ptrs.push_back(&s->out);
+    }
+  }
+
+  /** @brief Construct a fragment with multiple dangling exits.
+   *  @p out is already a by-value copy, so it is swapped in, not copied again. */
+  Frag(State *s, std::vector<State **> out) : start(s) { out_ptrs.swap(out); }
+
+  /** @brief Patch all dangling exits to point to the given state.
+   *  Null entries and exits that are already connected are left untouched. */
+  void patch(State *s) {
+    for (State **slot : out_ptrs) {
+      if (slot && !*slot) {
+        *slot = s;
+      }
+    }
+  }
+};
+
+#endif // NFA_HPP
diff --git a/libpz/include/NfaBuilder.hpp b/libpz/include/NfaBuilder.hpp
@@ -0,0 +1,62 @@
+#ifndef NFA_BUILDER_HPP
+#define NFA_BUILDER_HPP
+
+#include <Nfa.hpp>
+
+/**
+ * @brief Builds an ε-NFA from a postfix regex token sequence.
+ *
+ * Implements Thompson-style construction to convert postfix regex tokens
+ * into an NFA graph. All states created during construction are owned
+ * internally and cleaned up automatically.
+ */
+class NfaBuilder {
+public:
+  /**
+   * @brief Build an NFA from a postfix regex.
+   *
+   * The resulting NFA has a single accepting state of type
+   * StateType::MATCH. The returned pointer refers to the start state.
+   * NOTE(review): presumably dangles once this builder (state_pool) dies.
+   * @param postfix Regex tokens in postfix (RPN) form; read-only.
+   * @return Non-owning raw pointer to the start state of the NFA.
+   */
+  State *build(const std::vector<Token> &postfix);
+
+  /**
+   * @brief Create a deep copy of an NFA fragment.
+   *
+   * Used for handling quantifiers that require duplication of subgraphs
+   * (e.g. {m,n}, *, +). The source fragment is passed by value.
+   */
+  Frag copy_fragment(Frag);
+
+  /**
+   * @brief Deep copy an NFA subgraph starting from a given state.
+   *
+   * Keeps a lookup map to avoid duplicating already-copied states.
+   * Parameters are unnamed in this declaration; they are listed by role.
+   * @param s Original state to copy.
+   * @param lookup Map from original states to their copies (non-const ref).
+   * @return Pointer to the copied state.
+   */
+  State *copy_state(State *, std::unordered_map<State *, State *> &);
+
+private:
+  /**
+   * @brief Allocate a new NFA state and store it in the internal pool.
+   *
+   * Ownership is retained by the builder; callers get a raw State pointer.
+   */
+  State *create_state(StateType type);
+
+  /**
+   * @brief Owns all NFA states created during construction.
+   *
+   * Each State is held through a std::unique_ptr, so the objects remain
+   * valid for the lifetime of the NfaBuilder and are destroyed with it.
+   */
+  std::vector<std::unique_ptr<State>> state_pool;
+};
+
+#endif // NFA_BUILDER_HPP
diff --git a/libpz/include/RegexPostfix.hpp b/libpz/include/RegexPostfix.hpp
@@ -0,0 +1,28 @@
+#ifndef REGEX_POSTFIX_HPP
+#define REGEX_POSTFIX_HPP
+
+#include <RegexTokenizer.hpp>
+#include <pz_cxx_std.hpp>
+#include <pz_types.hpp>
+
+/**
+ * @brief Converts regex tokens from infix to postfix (RPN) form.
+ *
+ * This conversion is used as a preprocessing step before NFA construction.
+ * The class is stateless and intended to be used via its static methods.
+ */
+class Postfix {
+public:
+  /** @brief Convert an infix token sequence into postfix (RPN) order.
+   *  @param infix Tokens in infix order; taken by const reference.
+   *  @return A new vector, returned by value, holding the postfix tokens. */
+  static std::vector<Token> convert(const std::vector<Token> &infix);
+
+private:
+  /** @brief Returns precedence of a regex operator token.
+   *  @param type Token category to look up.
+   *  @return Precedence as st32; the scale is set in the .cpp — TODO confirm. */
+  static st32 get_precedence(TokenType type);
+};
+
+#endif // REGEX_POSTFIX_HPP
diff --git a/libpz/include/RegexTokenizer.hpp b/libpz/include/RegexTokenizer.hpp
@@ -0,0 +1,145 @@
+#ifndef REGEX_TOKENIZER_HPP
+#define REGEX_TOKENIZER_HPP
+
+#include <pz_cxx_std.hpp>
+#include <pz_types.hpp>
+
+/**
+ * @brief Types of tokens produced by the regex tokenizer.
+ */
+enum class TokenType {
+  /** Literal character like 'a', 'b', etc. (value in Token::literal) */
+  LITERAL,
+
+  /** '.' wildcard */
+  DOT,
+
+  /** '*' operator */
+  STAR,
+
+  /** '+' operator */
+  PLUS,
+
+  /** '?' operator */
+  QUESTION,
+
+  /** '|' alternation */
+  ALTERNATION,
+
+  /** '(' opening group (see Token::group_id) */
+  LPAREN,
+
+  /** ')' closing group (see Token::group_id) */
+  RPAREN,
+
+  /** '^' start anchor */
+  CARET,
+
+  /** '$' end anchor */
+  DOLLAR,
+
+  /** Character class: '[...]', \d, \w, \s, etc (see Token::ranges) */
+  CHAR_CLASS,
+
+  /** Quantifier range: '{m,n}', '{m,}', '{m}' (Token::min / Token::max) */
+  QUANTIFIER_RANGE,
+
+  /** End of pattern (last token returned by Tokenizer::tokenize) */
+  END,
+
+  /** Implicit concatenation (see Tokenizer::add_concat_tokens) */
+  CONCAT
+};
+
+/**
+ * @brief Represents a character range [lo, hi].
+ */
+struct CharRange {
+  /** Lower bound of the range; defaults to 0 so it never starts undefined */
+  ut8 lo = 0;
+
+  /** Upper bound of the range; defaults to 0 for the same reason */
+  ut8 hi = 0;
+};
+
+/**
+ * @brief A single token in the regex.
+ */
+struct Token {
+  /** Token category; defaults to LITERAL, the value `Token{}` already gave */
+  TokenType type = TokenType::LITERAL;
+  /** Position in pattern (for error reporting); defaults to 0 */
+  size_t pos = 0;
+  /** Group ID for parentheses; defaults to -1 */
+  st32 group_id = -1;
+
+  /** Literal character value */
+  ut8 literal = '\0';
+
+  /** Whether character class is negated */
+  bool negated = false;
+  /** Character ranges for character class */
+  std::vector<CharRange> ranges{};
+
+  /** Minimum repetitions for quantifier */
+  st32 min = 0;
+  /** Maximum repetitions (-1 means unbounded) */
+  st32 max = 0;
+};
+
+/**
+ * @brief Converts a regex pattern into a sequence of tokens.
+ */
+class Tokenizer {
+public:
+  /**
+   * @brief Construct tokenizer for a pattern.
+   * @param pat Regex pattern, passed as a std::string_view (text not owned).
+   */
+  explicit Tokenizer(std::string_view pat);
+
+  /**
+   * @brief Tokenize the entire pattern.
+   * @return Vector of tokens ending with END token (returned by value).
+   */
+  std::vector<Token> tokenize();
+
+private:
+  /** Input regex pattern; non-owning view, so the buffer must outlive it */
+  std::string_view pattern;
+  /** Current cursor position; starts at 0 */
+  size_t cursor_pos = 0;
+  /** Counter for assigning group IDs; starts at 0 */
+  st32 group_counter = 0;
+  /** Stack for nested group tracking (st32 entries, presumably group IDs) */
+  std::stack<st32> group_stack;
+
+  /** Peek next character without consuming (const member) */
+  ut8 peek() const;
+  /** Consume next character; the result is a ut8 */
+  ut8 get();
+  /** Check for end of input (const member) */
+  bool eof() const;
+
+  /** Read next token */
+  Token next_token();
+  /** Read literal character; the unnamed ut8 is presumably that character */
+  Token read_literal(ut8);
+  /** Read escape sequence */
+  Token read_escape();
+  /** Read character class */
+  Token read_char_class();
+  /** Read quantifier range */
+  Token read_quantifier();
+
+  /** @brief Populates the given token with ranges for \d, \w, \s, etc. */
+  void add_shorthand_ranges(ut8, Token &);
+
+  /** @brief Inserts implicit CONCAT tokens into the given vector in place. */
+  void add_concat_tokens(std::vector<Token> &);
+
+  /** @brief Sorts and merges overlapping ranges in place for NFA matching. */
+  void normalize_ranges(std::vector<CharRange> &);
+};
+
+#endif // REGEX_TOKENIZER_HPP
diff --git a/libpz/include/pz_cxx_std.hpp b/libpz/include/pz_cxx_std.hpp
@@ -11,6 +11,7 @@
 #include <optional>
 #include <set>
 #include <sstream>
+#include <stack>
 #include <string>
 #include <string_view>
 #include <unordered_map>