From 925b1a10a9562c3c2ce67b8d4b224deeb615707f Mon Sep 17 00:00:00 2001 From: Sanskar Date: Mon, 27 Oct 2025 23:30:04 +0530 Subject: [PATCH] Add .hpp header for regex. --- libpz/regex_temp/NFA.hpp | 186 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 libpz/regex_temp/NFA.hpp diff --git a/libpz/regex_temp/NFA.hpp b/libpz/regex_temp/NFA.hpp new file mode 100644 index 0000000..730961b --- /dev/null +++ b/libpz/regex_temp/NFA.hpp @@ -0,0 +1,186 @@ +#ifndef PZ_REGEX_HPP +#define PZ_REGEX_HPP + +#include +#include +#include + +/** @brief namespace PzRegex */ +namespace PzRegex { +enum class StateType; // NFA state types +enum class AssertionType; // Assertion types for regex anchors +struct CharClass; // Character class representation +struct State; // NFA state node +struct PtrList; // Linked list for patching transitions +struct Frag; // NFA fragment during construction +struct CaptureGroup; // Capture group information +class NFABuilder; // NFA construction from postfix regex +class NFASimulator; // NFA simulation engine +}; // namespace PzRegex + +/** + * @name State type enumeration + * @brief Different types of states in the NFA. + * @details + * Each state type represents a different operation in the regex: + * character matching, epsilon transitions, accepting states, etc. + */ +enum class PzRegex::StateType : st32 { + STATE_CHAR = 0, // Consume a character + STATE_SPLIT, // ε-transition (alternation) + STATE_MATCH, // Accepting state + STATE_CHARCLASS, // Character class [a-z] + STATE_ASSERTION, // ^, $, \b + STATE_CAPTURE_START, // Start of capture group + STATE_CAPTURE_END // End of capture group +}; + +/** + * @name Assertion type enumeration + * @brief Types of zero-width assertions in regex patterns. + * @details + * Assertions match positions rather than characters. + */ +enum class PzRegex::AssertionType : st32 { + ASSERT_NONE = 0, // No assertion + ASSERT_START_LINE, // ^ - Start of line + ASSERT_END_LINE, // $ - End of line + ASSERT_WORD_BOUND // \b - Word boundary +}; + +// Convenience type aliases +using PzStateType = PzRegex::StateType; +using PzAssertionType = PzRegex::AssertionType; + +/** + * @brief Character class for pattern matching + */ +namespace PzRegex { +struct CharClass { + std::set chars; // Characters in the class + bool negated = false; // If true, matches characters NOT in set + + CharClass() = default; + + void addRange(char start, char end); // Add character range [start-end] + void addChar(char c); // Add single character + bool matches(char c) const; // Check if character matches class +}; + +/** + * @brief NFA state node + */ +struct State { + StateType type; // Type of state + st32 c; // Character for STATE_CHAR + State *out = nullptr; // First transition (raw pointer) + State *out1 = nullptr; // Second transition (raw pointer for SPLIT) + st32 lastlist = 0; // Bookkeeping for simulation + + // Extended features + std::unique_ptr charClass = nullptr; // For STATE_CHARCLASS + AssertionType assertion = ASSERT_NONE; // For STATE_ASSERTION + st32 captureIndex = -1; // For capture groups + bool greedy = true; // Greedy vs non-greedy matching + + State(StateType t, st32 ch = 0, State *o = nullptr, State *o1 = nullptr); + + State(st32 t, State *o = nullptr, State *o1 = nullptr); +}; + +/** + * @brief Linked list of state pointers for patching + */ +struct PtrList { + State **p; // Pointer to a State raw pointer + std::unique_ptr next; // Next item in list + + PtrList(State **ptr, std::unique_ptr n = nullptr); +}; + +/** + * @brief NFA fragment during construction + */ +struct Frag { + std::unique_ptr start; // Start state of fragment (owned) + std::unique_ptr out; // Dangling output pointers + + Frag(std::unique_ptr s = nullptr, + std::unique_ptr o = nullptr); +}; + +/** + * @brief Captured group information + */ +struct CaptureGroup { + st32 start_pos; // Starting position + st32 end_pos; // Ending position + std::string text; // Captured text +}; + +/** + * @brief NFA builder from postfix regex + */ +class NFABuilder { +private: + State *matchstate_; // The accepting state (non-owning) + st32 next_capture_index_ = 0; // Counter for capture groups + + static void patch(PtrList *l, State *s); // Patch dangling pointers + static std::unique_ptr + append(std::unique_ptr l1, + std::unique_ptr l2); // Append PtrLists + +public: + NFABuilder(); + + std::unique_ptr + build(const std::string &postfix); // Build NFA from postfix regex + State *get_match_state() const; // Get match state + st32 get_capture_count() const; // Get capture group count +}; + +/** + * @brief NFA simulation engine for pattern matching + */ +class NFASimulator { +private: + struct ListItem { + State *state; // Non-owning pointer to state + std::vector caps; + }; + + struct List { + std::vector items; + void clear(); + }; + + State *start_; // Start state (non-owning) + State *matchstate_; // Match state (non-owning) + st32 listid_ = 0; // Generation counter + List l1_, l2_; // Current and next state lists + st32 num_capture_groups_; // Number of capture groups + std::vector captures_; // Storage for captures + + void addState(List *l, State *s, const std::string &input, st32 pos, + std::vector caps); // Add state to list + bool checkAssertion(State *s, const std::string &input, + st32 pos); // Check assertion at position + bool isWordBoundary(const std::string &input, + st32 pos); // Check word boundary + List *startList(State *s, List *l, st32 pos); // Initialize state list + void step(List *clist, const std::string &input, st32 pos, + List *nlist); // Execute simulation step + bool isMatch(List *l, const std::string &input, + st32 pos); // Check for match state + +public: + NFASimulator(State *start, State *match, st32 numCaptures); + + bool match(const std::string &s); // Match string against NFA + std::string get_capture(st32 index) const; // Get captured text +}; + +} // namespace PzRegex + +#endif // PZ_REGEX_HPP \ No newline at end of file