From 93facf192fd97ce77d313520995e01c4a96a2953 Mon Sep 17 00:00:00 2001 From: Keith MacDonald Date: Sat, 27 Dec 2025 14:03:48 +0000 Subject: [PATCH] Implemented UTF-16 surrogate pair matching using \x{dddddd}. --- .../boost/regex/v5/basic_regex_creator.hpp | 16 ++++++++ include/boost/regex/v5/basic_regex_parser.hpp | 39 +++++++++---------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/include/boost/regex/v5/basic_regex_creator.hpp b/include/boost/regex/v5/basic_regex_creator.hpp index a8a1a1dca..b59ac2d02 100644 --- a/include/boost/regex/v5/basic_regex_creator.hpp +++ b/include/boost/regex/v5/basic_regex_creator.hpp @@ -227,6 +227,7 @@ class basic_regex_creator re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s); re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t) { return insert_state(pos, t, sizeof(re_syntax_base)); } re_literal* append_literal(charT c); + re_literal* append_literal(std::uint32_t c32); re_syntax_base* append_set(const basic_char_set& char_set); re_syntax_base* append_set(const basic_char_set& char_set, std::integral_constant*); re_syntax_base* append_set(const basic_char_set& char_set, std::integral_constant*); @@ -352,6 +353,21 @@ re_literal* basic_regex_creator::append_literal(charT c) return result; } +template +re_literal* basic_regex_creator::append_literal(std::uint32_t c32) +{ + if (sizeof(charT) != 2 || (c32 & ~0xFFFFu) == 0) + return append_literal(static_cast(c32)); + + // Surrogate pair + const bool b = m_icase; + m_icase = false; + append_literal(static_cast(((c32 - 0x10000u) >> 10) + 0xD800u)); + re_literal* result = append_literal(static_cast((c32 & 0x3FFu) + 0xDC00u)); + m_icase = b; + return result; +} + template inline re_syntax_base* basic_regex_creator::append_set( const basic_char_set& char_set) diff --git a/include/boost/regex/v5/basic_regex_parser.hpp b/include/boost/regex/v5/basic_regex_parser.hpp index b7408dc42..99dc603df 100644 --- a/include/boost/regex/v5/basic_regex_parser.hpp +++ b/include/boost/regex/v5/basic_regex_parser.hpp @@ -89,7 +89,7 @@ class basic_regex_parser : public basic_regex_creator bool add_emacs_code(bool negate); bool unwind_alts(std::ptrdiff_t last_paren_start); digraph get_next_set_literal(basic_char_set& char_set); - charT unescape_character(); + std::uint32_t unescape_character(); regex_constants::syntax_option_type parse_options(); private: @@ -1678,7 +1678,7 @@ digraph basic_regex_parser::get_next_set_literal(basic_cha break; } ++m_position; - result = unescape_character(); + result = static_cast(unescape_character()); break; case regex_constants::syntax_open_set: { @@ -1761,13 +1761,13 @@ bool valid_value(charT c, std::intmax_t v) } template -charT basic_regex_parser::unescape_character() +std::uint32_t basic_regex_parser::unescape_character() { #ifdef BOOST_REGEX_MSVC #pragma warning(push) #pragma warning(disable:4127) #endif - charT result(0); + std::uint32_t result(0); if(m_position == m_end) { fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely."); @@ -1776,28 +1776,28 @@ charT basic_regex_parser::unescape_character() switch(this->m_traits.escape_syntax_type(*m_position)) { case regex_constants::escape_type_control_a: - result = charT('\a'); + result = static_cast('\a'); break; case regex_constants::escape_type_e: - result = charT(27); + result = static_cast(27); break; case regex_constants::escape_type_control_f: - result = charT('\f'); + result = static_cast('\f'); break; case regex_constants::escape_type_control_n: - result = charT('\n'); + result = static_cast('\n'); break; case regex_constants::escape_type_control_r: - result = charT('\r'); + result = static_cast('\r'); break; case regex_constants::escape_type_control_t: - result = charT('\t'); + result = static_cast('\t'); break; case regex_constants::escape_type_control_v: - result = charT('\v'); + result = static_cast('\v'); break; case regex_constants::escape_type_word_assert: - result = charT('\b'); + result = static_cast('\b'); break; case regex_constants::escape_type_ascii_control: ++m_position; @@ -1809,7 +1809,7 @@ charT basic_regex_parser::unescape_character() fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely."); return result; } - result = static_cast(*m_position % 32); + result = static_cast(*m_position % 32); break; case regex_constants::escape_type_hex: ++m_position; @@ -1835,8 +1835,8 @@ charT basic_regex_parser::unescape_character() } std::intmax_t i = this->m_traits.toi(m_position, m_end, 16); if((m_position == m_end) - || (i < 0) - || ((std::numeric_limits::is_specialized) && (i > (std::intmax_t)(std::numeric_limits::max)())) + || (i < 0 || i > 0x10FFFF) + || (sizeof(charT) == 1 && (std::numeric_limits::is_specialized) && (i > (std::intmax_t)(std::numeric_limits::max)())) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) { // Rewind to start of escape: @@ -1846,7 +1846,7 @@ charT basic_regex_parser::unescape_character() return result; } ++m_position; - result = charT(i); + result = static_cast(i); } else { @@ -1861,7 +1861,7 @@ charT basic_regex_parser::unescape_character() fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character."); return result; } - result = charT(i); + result = static_cast(i); } return result; case regex_constants::syntax_digit: @@ -1939,7 +1939,7 @@ charT basic_regex_parser::unescape_character() return false; } default: - result = *m_position; + result = static_cast(*m_position); break; } ++m_position; @@ -1958,8 +1958,7 @@ bool basic_regex_parser::parse_backref() if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs))) { // not a backref at all but an octal escape sequence: - charT c = unescape_character(); - this->append_literal(c); + this->append_literal(unescape_character()); } else if((i > 0)) {