diff --git a/crates/lib-core/src/dialects.rs b/crates/lib-core/src/dialects.rs index ce85e45ac..1fd8d1045 100644 --- a/crates/lib-core/src/dialects.rs +++ b/crates/lib-core/src/dialects.rs @@ -5,12 +5,14 @@ pub mod syntax; use std::borrow::Cow; use std::collections::hash_map::Entry; use std::fmt::Debug; +use std::sync::{Arc, OnceLock}; use ahash::{AHashMap, AHashSet}; use crate::dialects::init::DialectKind; use crate::dialects::syntax::SyntaxKind; use crate::helpers::ToMatchable; +use crate::parser::compiled::{CompileError, CompiledGrammar}; use crate::parser::lexer::{Lexer, Matcher}; use crate::parser::matchable::Matchable; use crate::parser::parsers::StringParser; @@ -24,6 +26,7 @@ pub struct Dialect { sets: AHashMap<&'static str, AHashSet<&'static str>>, pub bracket_collections: AHashMap<&'static str, AHashSet>, lexer: Option, + compiled_grammar: Arc>>, } impl PartialEq for Dialect { @@ -33,6 +36,10 @@ impl PartialEq for Dialect { } impl Dialect { + fn invalidate_compiled_grammar(&mut self) { + self.compiled_grammar = Arc::new(OnceLock::new()); + } + pub fn new() -> Self { Dialect { name: DialectKind::Ansi, @@ -45,6 +52,7 @@ impl Dialect { } pub fn add(&mut self, iter: impl IntoIterator, DialectElementType)>) { + self.invalidate_compiled_grammar(); self.library.extend(iter); } @@ -63,6 +71,7 @@ impl Dialect { #[track_caller] pub fn replace_grammar(&mut self, name: &'static str, match_grammar: Matchable) { + self.invalidate_compiled_grammar(); match self.library.entry(Cow::Borrowed(name)) { Entry::Occupied(entry) => { let target = entry.into_mut(); @@ -90,6 +99,7 @@ impl Dialect { } pub fn insert_lexer_matchers(&mut self, lexer_patch: Vec, before: &str) { + self.invalidate_compiled_grammar(); assert!( !self.lexer_matchers.is_empty(), "Lexer struct must be defined before it can be patched!" 
@@ -119,6 +129,7 @@ impl Dialect { } pub fn patch_lexer_matchers(&mut self, lexer_patch: Vec) { + self.invalidate_compiled_grammar(); assert!( !self.lexer_matchers.is_empty(), "Lexer struct must be defined before it can be patched!" @@ -143,6 +154,7 @@ impl Dialect { } pub fn set_lexer_matchers(&mut self, lexer_matchers: Vec) { + self.invalidate_compiled_grammar(); self.lexer_matchers = lexer_matchers; } @@ -158,6 +170,7 @@ impl Dialect { } pub fn sets_mut(&mut self, label: &'static str) -> &mut AHashSet<&'static str> { + self.invalidate_compiled_grammar(); assert!( label != "bracket_pairs" && label != "angle_bracket_pairs", "Use `bracket_sets` to retrieve {label} set." @@ -174,11 +187,13 @@ impl Dialect { set_label: &'static str, values: &'static str, ) { + self.invalidate_compiled_grammar(); let keywords = values.lines().map(str::trim); self.sets_mut(set_label).extend(keywords); } pub fn add_keyword_to_set(&mut self, set_label: &'static str, value: &'static str) { + self.invalidate_compiled_grammar(); self.sets_mut(set_label).insert(value); } @@ -195,6 +210,7 @@ impl Dialect { } pub fn bracket_sets_mut(&mut self, label: &'static str) -> &mut AHashSet { + self.invalidate_compiled_grammar(); assert!( label == "bracket_pairs" || label == "angle_bracket_pairs", "Invalid bracket set. Consider using another identifier instead." 
@@ -204,6 +220,7 @@ impl Dialect { } pub fn update_bracket_sets(&mut self, label: &'static str, pairs: Vec) { + self.invalidate_compiled_grammar(); let set = self.bracket_sets_mut(label); for pair in pairs { set.insert(pair); @@ -223,7 +240,28 @@ impl Dialect { } } + pub(crate) fn segment_generator_names(&self) -> Vec> { + self.library + .iter() + .filter_map(|(name, elem)| match elem { + DialectElementType::Matchable(_) => None, + DialectElementType::SegmentGenerator(_) => Some(name.clone()), + }) + .collect() + } + + pub(crate) fn matchable_entries(&self) -> Vec<(Cow<'static, str>, Matchable)> { + self.library + .iter() + .filter_map(|(name, elem)| match elem { + DialectElementType::Matchable(matchable) => Some((name.clone(), matchable.clone())), + DialectElementType::SegmentGenerator(_) => None, + }) + .collect() + } + pub fn expand(&mut self) { + self.invalidate_compiled_grammar(); // Temporarily take ownership of 'library' from 'self' to avoid borrow checker // errors during mutation. 
let mut library = std::mem::take(&mut self.library); @@ -259,6 +297,16 @@ impl Dialect { pub fn lexer(&self) -> &Lexer { self.lexer.as_ref().unwrap() } + + pub fn compile_grammar(&self) -> Result<&CompiledGrammar, CompileError> { + match self + .compiled_grammar + .get_or_init(|| CompiledGrammar::from_dialect(self)) + { + Ok(grammar) => Ok(grammar), + Err(err) => Err(err.clone()), + } + } } pub type BracketPair = (&'static str, &'static str, &'static str, bool); diff --git a/crates/lib-core/src/parser.rs b/crates/lib-core/src/parser.rs index 8adad9872..fe4f4cdf6 100644 --- a/crates/lib-core/src/parser.rs +++ b/crates/lib-core/src/parser.rs @@ -1,9 +1,9 @@ -pub mod context; +pub mod compiled; pub mod grammar; pub mod lexer; pub mod lookahead; pub mod markers; -pub mod match_algorithms; +pub(crate) mod match_algorithms; pub mod match_result; pub mod matchable; pub mod node_matcher; @@ -13,8 +13,6 @@ pub mod types; use crate::dialects::Dialect; use crate::errors::SQLParseError; -use crate::parser::segments::file::FileSegment; -use context::ParseContext; use segments::{ErasedSegment, Tables}; #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] @@ -117,20 +115,23 @@ impl<'a> Parser<'a> { return Ok(None); } - // NOTE: This is the only time we use the parse context not in the - // context of a context manager. That's because it's the initial - // instantiation. - let mut parse_cx: ParseContext = self.into(); - - // Kick off parsing with the root segment. The BaseFileSegment has - // a unique entry point to facilitate exaclty this. All other segments - // will use the standard .match()/.parse() route. 
- let root = - FileSegment.root_parse(tables, parse_cx.dialect().name, segments, &mut parse_cx)?; + let compiled = self + .dialect + .compile_grammar() + .map_err(|err| SQLParseError { + description: format!("Failed to compile grammar: {err}"), + segment: None, + })?; + let root = compiled.root_parse_file( + tables, + self.dialect.name(), + self.dialect, + segments, + self.indentation_config, + )?; #[cfg(debug_assertions)] { - // Basic Validation, that we haven't dropped anything. let join_segments_raw = |segments: &[ErasedSegment]| { smol_str::SmolStr::from_iter(segments.iter().map(|s| s.raw().as_str())) }; @@ -140,4 +141,28 @@ impl<'a> Parser<'a> { Ok(root.into()) } + + pub fn parse_as( + &self, + tables: &Tables, + root_name: &str, + segments: &[ErasedSegment], + ) -> Result, SQLParseError> { + let compiled = self + .dialect + .compile_grammar() + .map_err(|err| SQLParseError { + description: format!("Failed to compile grammar: {err}"), + segment: None, + })?; + + compiled.root_parse_as( + tables, + self.dialect.name(), + self.dialect, + root_name, + segments, + self.indentation_config, + ) + } } diff --git a/crates/lib-core/src/parser/compiled.rs b/crates/lib-core/src/parser/compiled.rs new file mode 100644 index 000000000..892b8c57a --- /dev/null +++ b/crates/lib-core/src/parser/compiled.rs @@ -0,0 +1,3151 @@ +use std::ops::Deref; +use std::rc::Rc; + +use ahash::{AHashMap, AHashSet}; +use fancy_regex::Regex; +use itertools::{Itertools as _, enumerate, multiunzip}; +use rustc_hash::FxHashMap; +use smol_str::SmolStr; +use thiserror::Error; + +use super::IndentationConfig; +use super::match_algorithms::{ + first_non_whitespace, first_trimmed_raw, skip_start_index_forward_to_code, + skip_stop_index_backward_to_code, +}; +use super::match_result::{MatchResult, Matched, Span}; +use super::matchable::{Matchable, MatchableTrait, MatchableTraitImpl}; +use super::segments::{ErasedSegment, SegmentBuilder, Tables}; +use crate::dialects::Dialect; +use 
crate::dialects::init::DialectKind; +use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; +use crate::errors::SQLParseError; +use crate::helpers::IndexSet; + +pub type SymbolId = u32; + +type LocKey = u32; +type LocKeyData = (usize, usize, SyntaxKind, u32); +type BracketMatch = Result<(MatchResult, Option, Vec), SQLParseError>; +type SimpleSet = (AHashSet, SyntaxSet); + +#[derive(Default)] +struct NextMatchPrepared { + raw_simple_map: AHashMap>, + type_simple_map: AHashMap>, + type_simple_keys: SyntaxSet, +} + +struct NextMatchScratch { + matcher_idxs: Vec, + visited: Vec, + visit_stamp: u32, + raw_keys: Vec>, + raw_key_addrs: Vec, +} + +impl NextMatchScratch { + fn new(matcher_count: usize) -> Self { + Self { + matcher_idxs: Vec::new(), + visited: vec![0_u32; matcher_count], + visit_stamp: 1, + raw_keys: Vec::new(), + raw_key_addrs: Vec::new(), + } + } + + fn ensure_matcher_capacity(&mut self, matcher_count: usize) { + if self.visited.len() < matcher_count { + self.visited.resize(matcher_count, 0); + } + } + + fn ensure_segment_capacity(&mut self, segment_count: usize) { + if self.raw_keys.len() < segment_count { + self.raw_keys.resize_with(segment_count, || None); + self.raw_key_addrs.resize(segment_count, 0); + } + } +} + +struct NextExBracketPrepared { + start_brackets: Vec, + end_brackets: Vec, + bracket_persists: Vec, + all_matchers: Vec, + next_match_prepared: NextMatchPrepared, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NodeId(pub u32); + +impl NodeId { + #[inline] + pub fn as_usize(self) -> usize { + self.0 as usize + } +} + +#[repr(u8)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum Kind { + Sequence, + OneOf, + AnyNumberOf, + Ref, + NodeMatcher, + String, + MultiString, + Regex, + Typed, + Code, + NonCode, + Nothing, + Anything, + Delimited, + Bracketed, + Meta, + Conditional, + BracketedSegmentMatcher, + LookaheadExclude, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Node { + 
pub kind: Kind, + pub a: u32, + pub b: u32, +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +struct NodeSlice { + start: u32, + len: u32, +} + +impl NodeSlice { + #[inline] + fn is_empty(self) -> bool { + self.len == 0 + } + + #[inline] + fn as_slice(self, kids: &[NodeId]) -> &[NodeId] { + &kids[self.start as usize..(self.start + self.len) as usize] + } +} + +#[derive(Debug, Clone)] +struct SequencePayload { + parse_mode: ParseMode, + allow_gaps: bool, + optional: bool, + terminators: NodeSlice, +} + +#[derive(Debug, Clone)] +struct AnyNumberOfPayload { + exclude: Option, + terminators: NodeSlice, + reset_terminators: bool, + max_times: Option, + min_times: usize, + max_times_per_element: Option, + allow_gaps: bool, + optional: bool, + parse_mode: ParseMode, +} + +#[derive(Debug, Clone)] +struct RefPayload { + symbol: SymbolId, + exclude: Option, + terminators: NodeSlice, + reset_terminators: bool, + optional: bool, + resolved: Option, +} + +#[derive(Debug, Clone, Copy)] +struct NodeMatcherPayload { + node_kind: SyntaxKind, + child: NodeId, +} + +#[derive(Debug, Clone)] +struct StringPayload { + template: u32, + kind: SyntaxKind, + optional: bool, +} + +#[derive(Debug, Clone)] +struct MultiStringPayload { + templates: NodeSlice, + kind: SyntaxKind, +} + +#[derive(Debug, Clone, Copy)] +struct RegexPayload { + regex_id: u32, + kind: SyntaxKind, +} + +#[derive(Debug, Clone, Copy)] +struct TypedPayload { + template: SyntaxKind, + kind: SyntaxKind, + optional: bool, +} + +#[derive(Debug, Clone)] +struct AnythingPayload { + terminators: NodeSlice, +} + +#[derive(Debug, Clone)] +struct DelimitedPayload { + allow_trailing: bool, + delimiter: NodeId, + min_delimiters: usize, + optional_delimiter: bool, + optional: bool, + allow_gaps: bool, + terminators: NodeSlice, +} + +#[derive(Debug, Clone)] +struct BracketedPayload { + bracket_type: SymbolId, + bracket_pairs_set: SymbolId, + allow_gaps: bool, + parse_mode: ParseMode, + inner: NodeId, +} + +#[derive(Debug, 
Clone, Copy)] +struct MetaPayload { + kind: SyntaxKind, +} + +#[derive(Debug, Clone, Copy)] +struct ConditionalPayload { + meta: SyntaxKind, + requirements: IndentationConfig, +} + +#[derive(Debug, Clone, Copy)] +struct LookaheadExcludePayload { + first_token: u32, + lookahead_token: u32, +} + +#[derive(Debug, Clone)] +struct RegexEntry { + regex: Regex, + anti_regex: Option, +} + +#[derive(Debug, Clone)] +enum Payload { + None, + Sequence(SequencePayload), + AnyNumberOf(AnyNumberOfPayload), + Ref(RefPayload), + NodeMatcher(NodeMatcherPayload), + String(StringPayload), + MultiString(MultiStringPayload), + Regex(RegexPayload), + Typed(TypedPayload), + Anything(AnythingPayload), + Delimited(DelimitedPayload), + Bracketed(BracketedPayload), + Meta(MetaPayload), + Conditional(ConditionalPayload), + LookaheadExclude(LookaheadExcludePayload), +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ParseMode { + Strict, + Greedy, + GreedyOnceStarted, +} + +impl From for ParseMode { + fn from(value: super::types::ParseMode) -> Self { + match value { + super::types::ParseMode::Strict => Self::Strict, + super::types::ParseMode::Greedy => Self::Greedy, + super::types::ParseMode::GreedyOnceStarted => Self::GreedyOnceStarted, + } + } +} + +#[derive(Debug, Error, Clone)] +pub enum CompileError { + #[error("dialect still contains SegmentGenerator for '{0}'")] + SegmentGenerator(String), + #[error("missing grammar reference '{0}'")] + MissingReference(String), + #[error("unsupported grammar shape: {0}")] + Unsupported(String), +} + +#[derive(Debug, PartialEq, Eq, Hash)] +struct CacheKey { + loc: LocKey, + key: u32, +} + +impl CacheKey { + #[inline] + fn new(loc: LocKey, key: u32) -> Self { + Self { loc, key } + } +} + +#[derive(Debug)] +struct CompiledParseContext<'a> { + grammar: &'a CompiledGrammar, + dialect: &'a Dialect, + terminators: Vec, + loc_keys: IndexSet, + parse_cache: FxHashMap, + simple_cache: FxHashMap>>, + indentation_config: IndentationConfig, +} + +impl<'a> 
CompiledParseContext<'a> { + fn new( + grammar: &'a CompiledGrammar, + dialect: &'a Dialect, + indentation_config: IndentationConfig, + ) -> Self { + Self { + grammar, + dialect, + terminators: Vec::new(), + loc_keys: IndexSet::default(), + parse_cache: FxHashMap::default(), + simple_cache: FxHashMap::default(), + indentation_config, + } + } + + #[inline] + fn deeper_match( + &mut self, + clear_terminators: bool, + push_terminators: &[NodeId], + f: impl FnOnce(&mut Self) -> T, + ) -> T { + let (appended, terms) = self.set_terminators(clear_terminators, push_terminators); + let ret = f(self); + self.reset_terminators(appended, terms, clear_terminators); + ret + } + + fn set_terminators( + &mut self, + clear_terminators: bool, + push_terminators: &[NodeId], + ) -> (usize, Vec) { + let mut appended = 0; + let terminators = if clear_terminators { + self.terminators.clone() + } else { + Vec::new() + }; + + if clear_terminators && !self.terminators.is_empty() { + self.terminators = if !push_terminators.is_empty() { + push_terminators.to_vec() + } else { + Vec::new() + }; + } else if !push_terminators.is_empty() { + for &terminator in push_terminators { + let already_present = self.terminators.iter().any(|&existing| { + self.grammar.node_eq_group(existing) == self.grammar.node_eq_group(terminator) + }); + + if !already_present { + self.terminators.push(terminator); + appended += 1; + } + } + } + + (appended, terminators) + } + + fn reset_terminators( + &mut self, + appended: usize, + terminators: Vec, + clear_terminators: bool, + ) { + if clear_terminators { + self.terminators = terminators; + } else { + let new_len = self.terminators.len().saturating_sub(appended); + self.terminators.truncate(new_len); + } + } + + #[inline] + fn loc_key(&mut self, data: LocKeyData) -> LocKey { + let (key, _) = self.loc_keys.insert_full(data); + key as u32 + } + + #[inline] + fn check_parse_cache(&self, loc_key: LocKey, matcher_key: u32) -> Option<&MatchResult> { + 
self.parse_cache.get(&CacheKey::new(loc_key, matcher_key)) + } + + #[inline] + fn put_parse_cache( + &mut self, + loc_key: LocKey, + matcher_key: u32, + match_result: MatchResult, + ) -> &MatchResult { + self.parse_cache + .entry(CacheKey::new(loc_key, matcher_key)) + .or_insert(match_result) + } +} + +#[derive(Debug, Clone)] +pub struct CompiledGrammar { + nodes: Vec, + payloads: Vec, + node_eq_groups: Vec, + next_node_eq_group: u32, + kids: Vec, + symbols: Vec, + symbol_index: AHashMap, + definitions: Vec>, + strings: Vec, + string_index: AHashMap, + regexes: Vec, + regex_index: AHashMap<(SmolStr, Option), u32>, + builtin_non_code: NodeId, + compiled: bool, +} + +impl Default for CompiledGrammar { + fn default() -> Self { + Self::new() + } +} + +impl CompiledGrammar { + pub fn new() -> Self { + let mut this = Self { + nodes: Vec::new(), + payloads: Vec::new(), + node_eq_groups: Vec::new(), + next_node_eq_group: 0, + kids: Vec::new(), + symbols: Vec::new(), + symbol_index: AHashMap::new(), + definitions: Vec::new(), + strings: Vec::new(), + string_index: AHashMap::new(), + regexes: Vec::new(), + regex_index: AHashMap::new(), + builtin_non_code: NodeId(0), + compiled: false, + }; + + let builtin_non_code = this.push_node(Kind::NonCode, 0, 0, Payload::None); + this.builtin_non_code = builtin_non_code; + this + } + + pub fn from_dialect(dialect: &Dialect) -> Result { + let mut compiler = LegacyCompiler { + dialect, + grammar: Self::new(), + seen: AHashMap::new(), + eq_representatives: Vec::new(), + next_eq_group: 0, + }; + + if let Some(name) = dialect.segment_generator_names().into_iter().next() { + return Err(CompileError::SegmentGenerator(name.into_owned())); + } + + for (name, matcher) in dialect.matchable_entries() { + let node_id = compiler.compile_matchable(&matcher)?; + compiler.grammar.define(name.as_ref(), node_id); + } + + compiler.grammar.compile() + } + + pub fn compile(mut self) -> Result { + self.resolve_refs()?; + self.normalize(); + self.compiled = 
true; + Ok(self) + } + + pub fn define(&mut self, name: impl Into, node: NodeId) { + let name: SmolStr = name.into(); + let symbol = self.intern_symbol(name.as_str()); + self.ensure_definitions_len(symbol); + self.definitions[symbol as usize] = Some(node); + } + + pub fn ref_(&mut self, name: impl AsRef) -> NodeId { + let symbol = self.intern_symbol(name.as_ref()); + let payload = RefPayload { + symbol, + exclude: None, + terminators: NodeSlice::default(), + reset_terminators: false, + optional: false, + resolved: None, + }; + + self.push_node(Kind::Ref, symbol, 0, Payload::Ref(payload)) + } + + pub fn keyword(&mut self, keyword: impl AsRef) -> NodeId { + let keyword = keyword.as_ref().to_ascii_uppercase(); + self.string(&keyword, SyntaxKind::Keyword) + } + + pub fn string(&mut self, template: impl AsRef, kind: SyntaxKind) -> NodeId { + let template_id = self.intern_string(template.as_ref().to_ascii_uppercase()); + let payload = StringPayload { + template: template_id, + kind, + optional: false, + }; + + self.push_node( + Kind::String, + template_id, + kind as u32, + Payload::String(payload), + ) + } + + pub fn regex( + &mut self, + pattern: impl AsRef, + anti_pattern: Option>, + kind: SyntaxKind, + ) -> NodeId { + let anti_pattern = anti_pattern.map(|it| it.as_ref().to_owned()); + let regex_id = self.intern_regex(pattern.as_ref(), anti_pattern.as_deref()); + let payload = RegexPayload { regex_id, kind }; + + self.push_node(Kind::Regex, regex_id, kind as u32, Payload::Regex(payload)) + } + + pub fn typed(&mut self, template: SyntaxKind, kind: SyntaxKind) -> NodeId { + let payload = TypedPayload { + template, + kind, + optional: false, + }; + + self.push_node( + Kind::Typed, + template as u32, + kind as u32, + Payload::Typed(payload), + ) + } + + pub fn code(&mut self) -> NodeId { + self.push_node(Kind::Code, 0, 0, Payload::None) + } + + pub fn non_code(&self) -> NodeId { + self.builtin_non_code + } + + pub fn nothing(&mut self) -> NodeId { + 
self.push_node(Kind::Nothing, 0, 0, Payload::None) + } + + pub fn sequence(&mut self, children: I) -> NodeId + where + I: IntoIterator, + { + let slice = self.push_children(children); + let payload = SequencePayload { + parse_mode: ParseMode::Strict, + allow_gaps: true, + optional: false, + terminators: NodeSlice::default(), + }; + self.push_node( + Kind::Sequence, + slice.start, + slice.len, + Payload::Sequence(payload), + ) + } + + pub fn one_of(&mut self, children: I) -> NodeId + where + I: IntoIterator, + { + let slice = self.push_children(children); + let payload = AnyNumberOfPayload { + exclude: None, + terminators: NodeSlice::default(), + reset_terminators: false, + max_times: Some(1), + min_times: 1, + max_times_per_element: None, + allow_gaps: true, + optional: false, + parse_mode: ParseMode::Strict, + }; + + self.push_node( + Kind::OneOf, + slice.start, + slice.len, + Payload::AnyNumberOf(payload), + ) + } + + pub fn node_matcher(&mut self, kind: SyntaxKind, child: NodeId) -> NodeId { + let payload = NodeMatcherPayload { + node_kind: kind, + child, + }; + + self.push_node( + Kind::NodeMatcher, + kind as u32, + child.0, + Payload::NodeMatcher(payload), + ) + } + + pub fn root(&self, name: &str) -> Option { + let symbol = self.symbol_index.get(name)?; + self.definitions[*symbol as usize] + } + + pub fn root_parse_file( + &self, + tables: &Tables, + dialect: DialectKind, + dialect_ref: &Dialect, + segments: &[ErasedSegment], + indentation_config: IndentationConfig, + ) -> Result { + let start_idx = segments + .iter() + .position(|segment| segment.is_code()) + .unwrap_or(0) as u32; + + let end_idx = segments + .iter() + .rposition(|segment| segment.is_code()) + .map_or(start_idx, |idx| idx as u32 + 1); + + if start_idx == end_idx { + return Ok(SegmentBuilder::node( + tables.next_id(), + SyntaxKind::File, + dialect, + segments.to_vec(), + ) + .position_from_segments() + .finish()); + } + + let final_seg = segments.last().unwrap(); + 
assert!(final_seg.get_position_marker().is_some()); + + let file_node = self + .root("FileSegment") + .ok_or_else(|| SQLParseError::new("missing FileSegment root"))?; + + let entry = match self.node(file_node).kind { + Kind::NodeMatcher => self.node_matcher_payload(file_node).child, + _ => file_node, + }; + + let mut ctx = CompiledParseContext::new(self, dialect_ref, indentation_config); + let match_result = + self.match_node(entry, &segments[..end_idx as usize], start_idx, &mut ctx)?; + + let match_span = match_result.span; + let has_match = match_result.has_match(); + let mut matched = match_result.apply(tables, dialect, segments); + let unmatched = &segments[match_span.end as usize..end_idx as usize]; + + let content: &[ErasedSegment] = if !has_match { + &[SegmentBuilder::node( + tables.next_id(), + SyntaxKind::Unparsable, + dialect, + segments[start_idx as usize..end_idx as usize].to_vec(), + ) + .position_from_segments() + .finish()] + } else if !unmatched.is_empty() { + let idx = unmatched + .iter() + .position(|it| it.is_code()) + .unwrap_or(unmatched.len()); + let (head, tail) = unmatched.split_at(idx); + + matched.extend_from_slice(head); + matched.push( + SegmentBuilder::node( + tables.next_id(), + SyntaxKind::Unparsable, + dialect, + tail.to_vec(), + ) + .position_from_segments() + .finish(), + ); + &matched + } else { + matched.extend_from_slice(unmatched); + &matched + }; + + Ok(SegmentBuilder::node( + tables.next_id(), + SyntaxKind::File, + dialect, + [ + &segments[..start_idx as usize], + content, + &segments[end_idx as usize..], + ] + .concat(), + ) + .position_from_segments() + .finish()) + } + + pub fn root_parse_as( + &self, + tables: &Tables, + dialect: DialectKind, + dialect_ref: &Dialect, + root_name: &str, + segments: &[ErasedSegment], + indentation_config: IndentationConfig, + ) -> Result, SQLParseError> { + if segments.is_empty() { + return Ok(Vec::new()); + } + + let start_idx = segments + .iter() + .position(|segment| segment.is_code()) + 
.unwrap_or(0) as u32; + + let end_idx = segments + .iter() + .rposition(|segment| segment.is_code()) + .map_or(start_idx, |idx| idx as u32 + 1); + + if start_idx == end_idx { + return Ok(Vec::new()); + } + + let root_node = self + .root(root_name) + .ok_or_else(|| SQLParseError::new(format!("missing {root_name} root")))?; + + let mut ctx = CompiledParseContext::new(self, dialect_ref, indentation_config); + let match_result = self.match_node( + root_node, + &segments[..end_idx as usize], + start_idx, + &mut ctx, + )?; + + Ok(match_result.apply(tables, dialect, segments)) + } + + fn node(&self, id: NodeId) -> &Node { + &self.nodes[id.as_usize()] + } + + fn payload(&self, id: NodeId) -> &Payload { + &self.payloads[id.as_usize()] + } + + fn node_children(&self, id: NodeId) -> &[NodeId] { + let node = self.node(id); + NodeSlice { + start: node.a, + len: node.b, + } + .as_slice(&self.kids) + } + + fn node_matcher_payload(&self, id: NodeId) -> NodeMatcherPayload { + match self.payload(id) { + Payload::NodeMatcher(payload) => *payload, + _ => unreachable!("node {:?} is not NodeMatcher", self.node(id).kind), + } + } + + fn sequence_payload(&self, id: NodeId) -> &SequencePayload { + match self.payload(id) { + Payload::Sequence(payload) => payload, + _ => unreachable!("node {:?} is not Sequence", self.node(id).kind), + } + } + + fn any_number_of_payload(&self, id: NodeId) -> &AnyNumberOfPayload { + match self.payload(id) { + Payload::AnyNumberOf(payload) => payload, + _ => unreachable!("node {:?} is not AnyNumberOf", self.node(id).kind), + } + } + + fn ref_payload(&self, id: NodeId) -> &RefPayload { + match self.payload(id) { + Payload::Ref(payload) => payload, + _ => unreachable!("node {:?} is not Ref", self.node(id).kind), + } + } + + fn string_payload(&self, id: NodeId) -> &StringPayload { + match self.payload(id) { + Payload::String(payload) => payload, + _ => unreachable!("node {:?} is not String", self.node(id).kind), + } + } + + fn multi_string_payload(&self, id: 
NodeId) -> &MultiStringPayload { + match self.payload(id) { + Payload::MultiString(payload) => payload, + _ => unreachable!("node {:?} is not MultiString", self.node(id).kind), + } + } + + fn regex_payload(&self, id: NodeId) -> RegexPayload { + match self.payload(id) { + Payload::Regex(payload) => *payload, + _ => unreachable!("node {:?} is not Regex", self.node(id).kind), + } + } + + fn typed_payload(&self, id: NodeId) -> TypedPayload { + match self.payload(id) { + Payload::Typed(payload) => *payload, + _ => unreachable!("node {:?} is not Typed", self.node(id).kind), + } + } + + fn anything_payload(&self, id: NodeId) -> &AnythingPayload { + match self.payload(id) { + Payload::Anything(payload) => payload, + _ => unreachable!("node {:?} is not Anything", self.node(id).kind), + } + } + + fn delimited_payload(&self, id: NodeId) -> &DelimitedPayload { + match self.payload(id) { + Payload::Delimited(payload) => payload, + _ => unreachable!("node {:?} is not Delimited", self.node(id).kind), + } + } + + fn bracketed_payload(&self, id: NodeId) -> &BracketedPayload { + match self.payload(id) { + Payload::Bracketed(payload) => payload, + _ => unreachable!("node {:?} is not Bracketed", self.node(id).kind), + } + } + + fn meta_payload(&self, id: NodeId) -> MetaPayload { + match self.payload(id) { + Payload::Meta(payload) => *payload, + _ => unreachable!("node {:?} is not Meta", self.node(id).kind), + } + } + + fn conditional_payload(&self, id: NodeId) -> ConditionalPayload { + match self.payload(id) { + Payload::Conditional(payload) => *payload, + _ => unreachable!("node {:?} is not Conditional", self.node(id).kind), + } + } + + fn lookahead_payload(&self, id: NodeId) -> LookaheadExcludePayload { + match self.payload(id) { + Payload::LookaheadExclude(payload) => *payload, + _ => unreachable!("node {:?} is not LookaheadExclude", self.node(id).kind), + } + } + + fn node_cache_key(&self, id: NodeId) -> u32 { + id.0 | 0x8000_0000 + } + + #[inline] + fn node_eq_group(&self, id: 
NodeId) -> u32 { + self.node_eq_groups[id.as_usize()] + } + + #[inline] + fn set_node_eq_group(&mut self, id: NodeId, group: u32) { + self.node_eq_groups[id.as_usize()] = group; + } + + fn symbol_name(&self, symbol: SymbolId) -> &str { + &self.symbols[symbol as usize] + } + + fn string_value(&self, id: u32) -> &str { + &self.strings[id as usize] + } + + fn get_definition_by_name(&self, name: &str) -> Option { + let symbol = self.symbol_index.get(name)?; + self.definitions[*symbol as usize] + } + + fn intern_symbol(&mut self, name: impl AsRef) -> SymbolId { + let name = SmolStr::new(name.as_ref()); + if let Some(&id) = self.symbol_index.get(&name) { + return id; + } + + let id = self.symbols.len() as u32; + self.symbols.push(name.clone()); + self.symbol_index.insert(name, id); + self.ensure_definitions_len(id); + id + } + + fn intern_string(&mut self, value: impl AsRef) -> u32 { + let value = SmolStr::new(value.as_ref()); + if let Some(&id) = self.string_index.get(&value) { + return id; + } + + let id = self.strings.len() as u32; + self.strings.push(value.clone()); + self.string_index.insert(value, id); + id + } + + fn intern_regex(&mut self, pattern: &str, anti_pattern: Option<&str>) -> u32 { + let key = (SmolStr::new(pattern), anti_pattern.map(SmolStr::new)); + if let Some(&id) = self.regex_index.get(&key) { + return id; + } + + let regex = Regex::new(pattern).unwrap(); + let anti_regex = anti_pattern.map(|it| Regex::new(it).unwrap()); + + let id = self.regexes.len() as u32; + self.regexes.push(RegexEntry { regex, anti_regex }); + self.regex_index.insert(key, id); + id + } + + fn ensure_definitions_len(&mut self, symbol: SymbolId) { + let required = symbol as usize + 1; + if self.definitions.len() < required { + self.definitions.resize(required, None); + } + } + + fn push_children(&mut self, children: I) -> NodeSlice + where + I: IntoIterator, + { + let start = self.kids.len() as u32; + self.kids.extend(children); + let len = self.kids.len() as u32 - start; + + 
NodeSlice { start, len } + } + + fn push_node(&mut self, kind: Kind, a: u32, b: u32, payload: Payload) -> NodeId { + let id = NodeId(self.nodes.len() as u32); + self.nodes.push(Node { kind, a, b }); + self.payloads.push(payload); + self.node_eq_groups.push(self.next_node_eq_group); + self.next_node_eq_group = self.next_node_eq_group.saturating_add(1); + id + } + + fn make_sequence( + &mut self, + children: Vec, + parse_mode: ParseMode, + allow_gaps: bool, + optional: bool, + terminators: Vec, + ) -> NodeId { + let child_slice = self.push_children(children); + let terminators = self.push_children(terminators); + + self.push_node( + Kind::Sequence, + child_slice.start, + child_slice.len, + Payload::Sequence(SequencePayload { + parse_mode, + allow_gaps, + optional, + terminators, + }), + ) + } + + fn make_any_number_of( + &mut self, + kind: Kind, + children: Vec, + payload: AnyNumberOfPayload, + ) -> NodeId { + let child_slice = self.push_children(children); + self.push_node( + kind, + child_slice.start, + child_slice.len, + Payload::AnyNumberOf(payload), + ) + } + + fn kids_slice(&self, slice: NodeSlice) -> &[NodeId] { + slice.as_slice(&self.kids) + } + + fn is_implicit_keyword_symbol(name: &str) -> bool { + !name.is_empty() + && name + .chars() + .all(|ch| ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_') + } + + fn resolve_refs(&mut self) -> Result<(), CompileError> { + let mut unresolved_symbols = AHashSet::new(); + for payload in &self.payloads { + let Payload::Ref(ref_payload) = payload else { + continue; + }; + + let unresolved = self + .definitions + .get(ref_payload.symbol as usize) + .copied() + .flatten() + .is_none(); + + if unresolved { + unresolved_symbols.insert(ref_payload.symbol); + } + } + + for symbol in unresolved_symbols { + if self + .definitions + .get(symbol as usize) + .copied() + .flatten() + .is_some() + { + continue; + } + + let symbol_name = self.symbol_name(symbol).to_owned(); + if Self::is_implicit_keyword_symbol(&symbol_name) 
{ + let keyword_node = self.keyword(&symbol_name); + self.ensure_definitions_len(symbol); + self.definitions[symbol as usize] = Some(keyword_node); + } + } + + for payload in &mut self.payloads { + let Payload::Ref(ref_payload) = payload else { + continue; + }; + + let resolved = self + .definitions + .get(ref_payload.symbol as usize) + .copied() + .flatten(); + + ref_payload.resolved = resolved; + } + + Ok(()) + } + + fn normalize(&mut self) { + for node_idx in 0..self.nodes.len() { + let node_id = NodeId(node_idx as u32); + match self.node(node_id).kind { + Kind::Sequence => self.normalize_sequence(node_id), + Kind::OneOf => self.normalize_one_of(node_id), + _ => {} + } + } + } + + fn normalize_sequence(&mut self, node_id: NodeId) { + let payload = self.sequence_payload(node_id).clone(); + let children = self.node_children(node_id); + let mut flattened: Option> = None; + + for (idx, child) in children.iter().copied().enumerate() { + if self.node(child).kind == Kind::Nothing { + flattened.get_or_insert_with(|| { + let mut out = Vec::with_capacity(children.len()); + out.extend_from_slice(&children[..idx]); + out + }); + continue; + } + + let can_flatten = self.node(child).kind == Kind::Sequence + && self + .payload(child) + .as_sequence() + .is_some_and(|child_payload| { + child_payload.parse_mode == payload.parse_mode + && child_payload.allow_gaps == payload.allow_gaps + && !child_payload.optional + && child_payload.terminators.is_empty() + }); + + if can_flatten { + let flattened = flattened.get_or_insert_with(|| { + let mut out = Vec::with_capacity(children.len()); + out.extend_from_slice(&children[..idx]); + out + }); + flattened.extend_from_slice(self.node_children(child)); + } else if let Some(flattened) = flattened.as_mut() { + flattened.push(child); + } + } + + let Some(flattened) = flattened else { + return; + }; + + let child_slice = self.push_children(flattened); + self.nodes[node_id.as_usize()].a = child_slice.start; + self.nodes[node_id.as_usize()].b = 
child_slice.len; + } + + fn normalize_one_of(&mut self, node_id: NodeId) { + let payload = self.any_number_of_payload(node_id).clone(); + if payload.exclude.is_some() + || payload.reset_terminators + || payload.max_times != Some(1) + || payload.min_times != 1 + || payload.max_times_per_element.is_some() + || !payload.allow_gaps + || payload.optional + || payload.parse_mode != ParseMode::Strict + || !payload.terminators.is_empty() + { + return; + } + + let children = self.node_children(node_id); + let mut flattened: Option> = None; + + for (idx, child) in children.iter().copied().enumerate() { + let is_plain_one_of = self.node(child).kind == Kind::OneOf + && self + .payload(child) + .as_any_number_of() + .is_some_and(|child_payload| { + child_payload.exclude.is_none() + && !child_payload.reset_terminators + && child_payload.max_times == Some(1) + && child_payload.min_times == 1 + && child_payload.max_times_per_element.is_none() + && child_payload.allow_gaps + && !child_payload.optional + && child_payload.parse_mode == ParseMode::Strict + && child_payload.terminators.is_empty() + }); + + if is_plain_one_of { + let flattened = flattened.get_or_insert_with(|| { + let mut out = Vec::with_capacity(children.len()); + out.extend_from_slice(&children[..idx]); + out + }); + flattened.extend_from_slice(self.node_children(child)); + } else if let Some(flattened) = flattened.as_mut() { + flattened.push(child); + } + } + + let Some(flattened) = flattened else { + return; + }; + + let child_slice = self.push_children(flattened); + self.nodes[node_id.as_usize()].a = child_slice.start; + self.nodes[node_id.as_usize()].b = child_slice.len; + } + + fn simple( + &self, + node_id: NodeId, + parse_context: &mut CompiledParseContext, + crumbs: Option>, + ) -> Option> { + let cacheable = crumbs.is_none(); + if cacheable && let Some(cached) = parse_context.simple_cache.get(&node_id) { + return cached.clone(); + } + + let result = match self.node(node_id).kind { + Kind::Sequence => { + let 
mut simple_raws = AHashSet::new(); + let mut simple_types = SyntaxSet::EMPTY; + + for child in self.node_children(node_id) { + let simple = self.simple(*child, parse_context, crumbs.clone())?; + let (raws, types) = simple.as_ref(); + simple_raws.extend(raws.iter().cloned()); + simple_types = simple_types.union(types); + + if !self.is_optional(*child) { + break; + } + } + + Some(Rc::new((simple_raws, simple_types))) + } + Kind::OneOf | Kind::AnyNumberOf | Kind::Delimited => { + let mut simple_raws = AHashSet::new(); + let mut simple_types = SyntaxSet::EMPTY; + + for child in self.node_children(node_id) { + let simple = self.simple(*child, parse_context, crumbs.clone())?; + let (raws, types) = simple.as_ref(); + simple_raws.extend(raws.iter().cloned()); + simple_types = simple_types.union(types); + } + + Some(Rc::new((simple_raws, simple_types))) + } + Kind::Ref => { + let payload = self.ref_payload(node_id); + if let Some(ref c) = crumbs + && c.contains(&payload.symbol) + { + let loop_string = c + .iter() + .map(|id| self.symbol_name(*id)) + .collect_vec() + .join(" -> "); + panic!("Self referential grammar detected: {loop_string}"); + } + + let mut new_crumbs = crumbs.unwrap_or_default(); + new_crumbs.push(payload.symbol); + + self.simple(payload.resolved?, parse_context, Some(new_crumbs)) + } + Kind::String => { + let payload = self.string_payload(node_id); + Some(Rc::new(( + [self.string_value(payload.template).to_owned()].into(), + SyntaxSet::EMPTY, + ))) + } + Kind::MultiString => { + let payload = self.multi_string_payload(node_id); + let raws = self + .kids_slice(payload.templates) + .iter() + .map(|id| self.string_value(id.0).to_owned()) + .collect(); + Some(Rc::new((raws, SyntaxSet::EMPTY))) + } + Kind::Typed => { + let payload = self.typed_payload(node_id); + Some(Rc::new(( + AHashSet::new(), + SyntaxSet::new(&[payload.template]), + ))) + } + Kind::NodeMatcher => { + let payload = self.node_matcher_payload(node_id); + self.simple(payload.child, 
parse_context, crumbs) + } + Kind::Bracketed => { + let payload = self.bracketed_payload(node_id); + let set = self.symbol_name(payload.bracket_pairs_set); + let target_type = self.symbol_name(payload.bracket_type); + let mut start = None; + + for (bracket_type, start_ref, _end_ref, _persists) in + parse_context.dialect.bracket_sets(set) + { + if bracket_type == target_type + && let Some(definition) = self.get_definition_by_name(start_ref) + { + start = Some(definition); + break; + } + } + + start.and_then(|it| self.simple(it, parse_context, crumbs)) + } + _ => None, + }; + + if cacheable { + parse_context.simple_cache.insert(node_id, result.clone()); + } + + result + } + + fn is_optional(&self, node_id: NodeId) -> bool { + match self.node(node_id).kind { + Kind::Sequence => self.sequence_payload(node_id).optional, + Kind::OneOf | Kind::AnyNumberOf => { + let payload = self.any_number_of_payload(node_id); + payload.optional || payload.min_times == 0 + } + Kind::Ref => self.ref_payload(node_id).optional, + Kind::String => self.string_payload(node_id).optional, + Kind::Typed => self.typed_payload(node_id).optional, + Kind::Delimited => self.delimited_payload(node_id).optional, + Kind::Bracketed => self.is_optional(self.bracketed_payload(node_id).inner), + _ => false, + } + } + + fn match_node( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + match self.node(node_id).kind { + Kind::Sequence => self.match_sequence(node_id, segments, idx, parse_context), + Kind::OneOf | Kind::AnyNumberOf => { + self.match_any_number_of(node_id, segments, idx, parse_context) + } + Kind::Ref => self.match_ref(node_id, segments, idx, parse_context), + Kind::NodeMatcher => self.match_node_matcher(node_id, segments, idx, parse_context), + Kind::String => self.match_string(node_id, segments, idx), + Kind::MultiString => self.match_multi_string(node_id, segments, idx), + Kind::Regex => self.match_regex(node_id, 
segments, idx), + Kind::Typed => self.match_typed(node_id, segments, idx), + Kind::Code => self.match_code(segments, idx), + Kind::NonCode => self.match_non_code(segments, idx), + Kind::Nothing => Ok(MatchResult::empty_at(idx)), + Kind::Anything => self.match_anything(node_id, segments, idx, parse_context), + Kind::Delimited => self.match_delimited(node_id, segments, idx, parse_context), + Kind::Bracketed => self.match_bracketed(node_id, segments, idx, parse_context), + Kind::Meta => panic!("Meta node has no direct match method"), + Kind::Conditional => self.match_conditional(node_id, idx, parse_context), + Kind::BracketedSegmentMatcher => self.match_bracketed_segment(segments, idx), + Kind::LookaheadExclude => self.match_lookahead_exclude(node_id, segments, idx), + } + } + + fn match_sequence( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + mut idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + let payload = self.sequence_payload(node_id); + let children = self.node_children(node_id); + + let start_idx = idx; + let mut matched_idx = idx; + let mut max_idx = segments.len() as u32; + let mut insert_segments = Vec::new(); + let mut child_matches = Vec::new(); + let mut first_match = true; + let mut meta_buffer = Vec::new(); + + if payload.parse_mode == ParseMode::Greedy { + let payload_terminators = self.kids_slice(payload.terminators); + let mut terminators = + Vec::with_capacity(payload_terminators.len() + parse_context.terminators.len()); + terminators.extend_from_slice(payload_terminators); + terminators.extend_from_slice(&parse_context.terminators); + + max_idx = self.trim_to_terminator(segments, idx, &terminators, parse_context)?; + } + + for child in children { + match self.node(*child).kind { + Kind::Conditional => { + let match_result = + self.match_node(*child, segments, matched_idx, parse_context)?; + for (_, submatch) in match_result.insert_segments { + meta_buffer.push(submatch); + } + continue; + } + Kind::Meta => { + 
meta_buffer.push(self.meta_payload(*child).kind); + continue; + } + _ => {} + } + + idx = if payload.allow_gaps { + skip_start_index_forward_to_code(segments, matched_idx, max_idx) + } else { + matched_idx + }; + + if idx >= max_idx { + if self.is_optional(*child) { + continue; + } + + if payload.parse_mode == ParseMode::Strict || matched_idx == start_idx { + return Ok(MatchResult::empty_at(idx)); + } + + insert_segments.extend(meta_buffer.into_iter().map(|meta| (matched_idx, meta))); + + return Ok(MatchResult { + span: Span { + start: start_idx, + end: matched_idx, + }, + insert_segments, + child_matches, + matched: Some(Matched::SyntaxKind(SyntaxKind::Unparsable)), + }); + } + + let mut elem_match = parse_context.deeper_match(false, &[], |ctx| { + self.match_node(*child, &segments[..max_idx as usize], idx, ctx) + })?; + + if !elem_match.has_match() { + if self.is_optional(*child) { + continue; + } + + if payload.parse_mode == ParseMode::Strict { + return Ok(MatchResult::empty_at(idx)); + } + + if payload.parse_mode == ParseMode::GreedyOnceStarted && matched_idx == start_idx { + return Ok(MatchResult::empty_at(idx)); + } + + if matched_idx == start_idx { + return Ok(MatchResult { + span: Span { + start: start_idx, + end: max_idx, + }, + matched: Some(Matched::SyntaxKind(SyntaxKind::Unparsable)), + ..MatchResult::default() + }); + } + + child_matches.push(MatchResult { + span: Span { + start: skip_start_index_forward_to_code(segments, matched_idx, max_idx), + end: max_idx, + }, + matched: Some(Matched::SyntaxKind(SyntaxKind::Unparsable)), + ..MatchResult::default() + }); + + return Ok(MatchResult { + span: Span { + start: start_idx, + end: max_idx, + }, + insert_segments, + child_matches, + matched: None, + }); + } + + let meta = std::mem::take(&mut meta_buffer); + insert_segments.append(&mut flush_metas(matched_idx, idx, meta)); + + matched_idx = elem_match.span.end; + + if first_match && payload.parse_mode == ParseMode::GreedyOnceStarted { + let 
payload_terminators = self.kids_slice(payload.terminators); + let mut terminators = + Vec::with_capacity(payload_terminators.len() + parse_context.terminators.len()); + terminators.extend_from_slice(payload_terminators); + terminators.extend_from_slice(&parse_context.terminators); + + max_idx = + self.trim_to_terminator(segments, matched_idx, &terminators, parse_context)?; + first_match = false; + } + + if elem_match.matched.is_some() { + child_matches.push(elem_match); + continue; + } + + child_matches.append(&mut elem_match.child_matches); + insert_segments.append(&mut elem_match.insert_segments); + } + + insert_segments.extend(meta_buffer.into_iter().map(|meta| (matched_idx, meta))); + + if matches!( + payload.parse_mode, + ParseMode::Greedy | ParseMode::GreedyOnceStarted + ) && max_idx > matched_idx + { + let idx = skip_start_index_forward_to_code(segments, matched_idx, max_idx); + let stop_idx = skip_stop_index_backward_to_code(segments, max_idx, idx); + + if stop_idx > idx { + child_matches.push(MatchResult { + span: Span { + start: idx, + end: stop_idx, + }, + matched: Some(Matched::SyntaxKind(SyntaxKind::Unparsable)), + ..Default::default() + }); + matched_idx = stop_idx; + } + } + + Ok(MatchResult { + span: Span { + start: start_idx, + end: matched_idx, + }, + matched: None, + insert_segments, + child_matches, + }) + } + + fn match_any_number_of( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + let payload = self.any_number_of_payload(node_id); + let elements = self.node_children(node_id); + + if let Some(exclude) = payload.exclude { + let match_result = parse_context.deeper_match(false, &[], |ctx| { + self.match_node(exclude, segments, idx, ctx) + })?; + + if match_result.has_match() { + return Ok(MatchResult::empty_at(idx)); + } + } + + let mut n_matches = 0; + let mut option_counter: Option> = payload + .max_times_per_element + .map(|_| elements.iter().copied().map(|elem| 
(elem, 0)).collect()); + let mut matched_idx = idx; + let mut working_idx = idx; + let mut matched = MatchResult::empty_at(idx); + let mut max_idx = segments.len() as u32; + + if payload.parse_mode == ParseMode::Greedy { + let payload_terminators = self.kids_slice(payload.terminators); + let mut terminators = if payload.reset_terminators { + Vec::with_capacity(payload_terminators.len()) + } else { + Vec::with_capacity(payload_terminators.len() + parse_context.terminators.len()) + }; + terminators.extend_from_slice(payload_terminators); + if !payload.reset_terminators { + terminators.extend_from_slice(&parse_context.terminators); + } + + max_idx = self.trim_to_terminator(segments, idx, &terminators, parse_context)?; + } + + loop { + if (n_matches >= payload.min_times && matched_idx >= max_idx) + || payload.max_times.is_some() && Some(n_matches) >= payload.max_times + { + return Ok(parse_mode_match_result( + segments, + matched, + max_idx, + payload.parse_mode, + )); + } + + if matched_idx >= max_idx { + return Ok(MatchResult::empty_at(idx)); + } + + let (match_result, matched_option) = parse_context.deeper_match( + payload.reset_terminators, + self.kids_slice(payload.terminators), + |ctx| self.longest_match(&segments[..max_idx as usize], elements, working_idx, ctx), + )?; + + if !match_result.has_match() { + if n_matches < payload.min_times { + matched = MatchResult::empty_at(idx); + } + + return Ok(parse_mode_match_result( + segments, + matched, + max_idx, + payload.parse_mode, + )); + } + + let matched_option = matched_option.unwrap(); + + if let Some(max_times_per_element) = payload.max_times_per_element + && let Some(counter) = option_counter + .as_mut() + .and_then(|counter| counter.get_mut(&matched_option)) + { + *counter += 1; + + if *counter > max_times_per_element { + return Ok(parse_mode_match_result( + segments, + matched, + max_idx, + payload.parse_mode, + )); + } + } + + matched = matched.append(match_result); + matched_idx = matched.span.end; + 
working_idx = matched_idx; + if payload.allow_gaps { + working_idx = + skip_start_index_forward_to_code(segments, matched_idx, segments.len() as u32); + } + n_matches += 1; + } + } + + fn match_ref( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + let payload = self.ref_payload(node_id); + let Some(elem) = payload.resolved else { + return Err(SQLParseError { + description: format!( + "Grammar refers to '{}' which was not found in the compiled grammar.", + self.symbol_name(payload.symbol) + ), + segment: segments.get(idx as usize).cloned(), + }); + }; + + if let Some(exclude) = payload.exclude { + let ctx = parse_context.deeper_match( + payload.reset_terminators, + self.kids_slice(payload.terminators), + |this| { + if self + .match_node(exclude, segments, idx, this) + .inspect_err(|e| log::error!("Parser error: {e:?}")) + .is_ok_and(|match_result| match_result.has_match()) + { + return Some(MatchResult::empty_at(idx)); + } + + None + }, + ); + + if let Some(ctx) = ctx { + return Ok(ctx); + } + } + + parse_context.deeper_match( + payload.reset_terminators, + self.kids_slice(payload.terminators), + |this| self.match_node(elem, segments, idx, this), + ) + } + + fn match_node_matcher( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + if idx >= segments.len() as u32 { + return Ok(MatchResult::empty_at(idx)); + } + + let payload = self.node_matcher_payload(node_id); + + if segments[idx as usize].get_type() == payload.node_kind { + return Ok(MatchResult::from_span(idx, idx + 1)); + } + + let match_result = parse_context.deeper_match(false, &[], |ctx| { + self.match_node(payload.child, segments, idx, ctx) + })?; + + Ok(match_result.wrap(Matched::SyntaxKind(payload.node_kind))) + } + + fn match_string( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + ) -> Result { + let payload = 
self.string_payload(node_id);
+        // An index at or past the end of `segments` is a non-match (as in
+        // `match_code`) rather than an out-of-bounds panic.
+        let Some(segment) = segments.get(idx as usize) else {
+            return Ok(MatchResult::empty_at(idx));
+        };
+
+        if segment.is_code()
+            && self
+                .string_value(payload.template)
+                .eq_ignore_ascii_case(segment.raw())
+        {
+            return Ok(MatchResult {
+                span: Span {
+                    start: idx,
+                    end: idx + 1,
+                },
+                matched: Some(Matched::Newtype(payload.kind)),
+                insert_segments: Vec::new(),
+                child_matches: Vec::new(),
+            });
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches the single code segment at `idx` when its raw text equals, ASCII
+    /// case-insensitively, any of the payload's templates.
+    ///
+    /// Returns a one-segment match tagged with `payload.kind`, or an empty match
+    /// when the segment is not code, no template matches, or `idx` is out of range.
+    fn match_multi_string(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.multi_string_payload(node_id);
+        let Some(segment) = segments.get(idx as usize) else {
+            return Ok(MatchResult::empty_at(idx));
+        };
+
+        if !segment.is_code() {
+            return Ok(MatchResult::empty_at(idx));
+        }
+
+        let segment_raw = segment.raw();
+
+        let matched = self.kids_slice(payload.templates).iter().any(|template| {
+            self.string_value(template.0)
+                .eq_ignore_ascii_case(segment_raw)
+        });
+
+        if matched {
+            return Ok(MatchResult {
+                span: Span {
+                    start: idx,
+                    end: idx + 1,
+                },
+                matched: Some(Matched::Newtype(payload.kind)),
+                ..<_>::default()
+            });
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches the single segment at `idx` when the payload's regex matches the
+    /// whole of its ASCII-upper-cased raw text and the optional anti-regex does not
+    /// match it.
+    ///
+    /// Regex evaluation errors are treated as "no match" for the main pattern and
+    /// as "does not exclude" for the anti-pattern. An out-of-range `idx` yields an
+    /// empty match.
+    fn match_regex(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.regex_payload(node_id);
+        let regex = &self.regexes[payload.regex_id as usize];
+        let Some(segment) = segments.get(idx as usize) else {
+            return Ok(MatchResult::empty_at(idx));
+        };
+        let segment_raw_upper = segment.raw().to_ascii_uppercase();
+
+        if let Some(result) = regex.regex.find(&segment_raw_upper).ok().flatten()
+            // Only a match covering the entire raw text counts.
+            && result.as_str() == segment_raw_upper.as_str()
+            && !regex.anti_regex.as_ref().is_some_and(|anti_template| {
+                anti_template
+                    .is_match(&segment_raw_upper)
+                    .unwrap_or_default()
+            })
+        {
+            return Ok(MatchResult {
+                span: Span {
+                    start: idx,
+                    end: idx + 1,
+                },
+                matched: Some(Matched::Newtype(payload.kind)),
+                insert_segments: Vec::new(),
+                child_matches: Vec::new(),
+            });
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches the single segment at `idx` when it is of the payload's template
+    /// type, tagging the match with `payload.kind`.
+    ///
+    /// Returns an empty match otherwise, including when `idx` is out of range.
+    fn match_typed(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.typed_payload(node_id);
+        let Some(segment) = segments.get(idx as usize) else {
+            return Ok(MatchResult::empty_at(idx));
+        };
+        if segment.is_type(payload.template) {
+            return Ok(MatchResult {
+                span: Span {
+                    start: idx,
+                    end: idx + 1,
+                },
+                matched: Some(Matched::Newtype(payload.kind)),
+                insert_segments: Vec::new(),
+                child_matches: Vec::new(),
+            });
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches exactly one segment at `idx` if it is a code segment; otherwise
+    /// (or when `idx` is past the end) returns an empty match.
+    fn match_code(
+        &self,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        if idx as usize >= segments.len() {
+            return Ok(MatchResult::empty_at(idx));
+        }
+
+        if segments[idx as usize].is_code() {
+            return Ok(MatchResult::from_span(idx, idx + 1));
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches the run of non-code segments starting at `idx`, up to (not
+    /// including) the next code segment.
+    ///
+    /// The match is only reported when a code segment is found after at least one
+    /// non-code segment; if no code segment follows `idx` at all, the result is an
+    /// empty match.
+    fn match_non_code(
+        &self,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        let mut matched_idx = idx;
+
+        for i in idx..segments.len() as u32 {
+            if segments[i as usize].is_code() {
+                matched_idx = i;
+                break;
+            }
+        }
+
+        if matched_idx > idx {
+            return Ok(MatchResult {
+                span: Span {
+                    start: idx,
+                    end: matched_idx,
+                },
+                ..Default::default()
+            });
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches everything from `idx` up to the first terminator.
+    ///
+    /// With no terminators on either the payload or the parse context, the whole
+    /// remainder of `segments` is claimed. Otherwise the payload's terminators
+    /// followed by the context's are handed to `greedy_match`, which is asked not
+    /// to include the terminator and to keep nested matches.
+    fn match_anything(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+        parse_context: &mut CompiledParseContext,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.anything_payload(node_id);
+        if payload.terminators.is_empty() && parse_context.terminators.is_empty() {
+            return Ok(MatchResult::from_span(idx, segments.len() as u32));
+        }
+
+        let mut terminators = self.kids_slice(payload.terminators).to_vec();
+        terminators.extend_from_slice(&parse_context.terminators);
+
+        self.greedy_match(segments, idx, parse_context, &terminators, false, true)
+    }
+
+    fn match_delimited(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+        parse_context: &mut CompiledParseContext,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.delimited_payload(node_id);
+        let elements = self.node_children(node_id);
+
+        let mut delimiters = 0;
+        let mut seeking_delimiter = false;
+        let max_idx =
segments.len() as u32; + let mut working_idx = idx; + let mut working_match = MatchResult::empty_at(idx); + let mut delimiter_match = None; + + let delimiter_matcher = payload.delimiter; + + let mut terminator_matchers = self.kids_slice(payload.terminators).to_vec(); + terminator_matchers.extend( + parse_context + .terminators + .iter() + .filter(|&&t| self.node_eq_group(delimiter_matcher) != self.node_eq_group(t)) + .copied(), + ); + + let delimiter_matchers = [payload.delimiter]; + + if !payload.allow_gaps { + terminator_matchers.push(self.builtin_non_code); + } + + loop { + if payload.allow_gaps && working_idx > idx { + working_idx = + skip_start_index_forward_to_code(segments, working_idx, segments.len() as u32); + } + + if working_idx >= max_idx { + break; + } + + let (match_result, _) = parse_context.deeper_match(false, &[], |this| { + self.longest_match(segments, &terminator_matchers, working_idx, this) + })?; + + if match_result.has_match() { + break; + } + + let mut push_terminators: &[NodeId] = &[]; + if !seeking_delimiter { + push_terminators = &delimiter_matchers; + } + + let (match_result, _) = + parse_context.deeper_match(false, push_terminators, |this| { + self.longest_match( + segments, + if seeking_delimiter { + &delimiter_matchers + } else { + elements + }, + working_idx, + this, + ) + })?; + + if !match_result.has_match() { + if seeking_delimiter && payload.optional_delimiter { + seeking_delimiter = false; + continue; + } + break; + } + + working_idx = match_result.span.end; + + if seeking_delimiter { + delimiter_match = Some(match_result); + } else { + if let Some(delimiter_match) = &delimiter_match { + delimiters += 1; + working_match = working_match.append(delimiter_match); + } + working_match = working_match.append(match_result); + } + + seeking_delimiter = !seeking_delimiter; + } + + if let Some(delimiter_match) = + delimiter_match.filter(|_delimiter_match| payload.allow_trailing && !seeking_delimiter) + { + delimiters += 1; + working_match 
= working_match.append(delimiter_match); + } + + if delimiters < payload.min_delimiters { + return Ok(MatchResult::empty_at(idx)); + } + + Ok(working_match) + } + + fn match_bracketed( + &self, + node_id: NodeId, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result { + let payload = self.bracketed_payload(node_id); + let set_name = self.symbol_name(payload.bracket_pairs_set); + let target_type = self.symbol_name(payload.bracket_type); + + let Some((start_bracket, end_bracket, bracket_persists)) = parse_context + .dialect + .bracket_sets(set_name) + .into_iter() + .find_map(|(bracket_type, start_ref, end_ref, persists)| { + if bracket_type != target_type { + return None; + } + + Some(( + self.get_definition_by_name(start_ref)?, + self.get_definition_by_name(end_ref)?, + persists, + )) + }) + else { + panic!( + "bracket_type {:?} not found in bracket_pairs ({set_name}) of {:?} dialect.", + target_type, parse_context.dialect.name + ); + }; + + let start_match = parse_context.deeper_match(false, &[], |ctx| { + self.match_node(start_bracket, segments, idx, ctx) + })?; + + if !start_match.has_match() { + return Ok(MatchResult::empty_at(idx)); + } + + let start_match_span = start_match.span; + + let bracketed_match = self.resolve_bracket( + segments, + start_match, + start_bracket, + &[start_bracket], + &[end_bracket], + &[bracket_persists], + parse_context, + false, + )?; + + let mut idx = start_match_span.end; + let mut end_idx = bracketed_match.span.end - 1; + + if payload.allow_gaps { + idx = skip_start_index_forward_to_code(segments, idx, segments.len() as u32); + end_idx = skip_stop_index_backward_to_code(segments, end_idx, idx); + } + + let content_match = parse_context.deeper_match(true, &[end_bracket], |ctx| { + self.match_node(payload.inner, &segments[..end_idx as usize], idx, ctx) + })?; + + if content_match.span.end != end_idx && payload.parse_mode == ParseMode::Strict { + return Ok(MatchResult::empty_at(idx)); + 
}
+
+        // Whatever lies between the end of the content match and the closing
+        // bracket (brackets are assumed to be one segment wide).
+        let intermediate_slice = Span {
+            start: content_match.span.end,
+            end: bracketed_match.span.end - 1,
+        };
+
+        // Only a *non-empty* remainder is the unsupported case when gaps are
+        // disallowed. An empty remainder is the ordinary tight match (in strict
+        // mode it is the only way to get here), so it must fall through.
+        if !payload.allow_gaps && intermediate_slice.start != intermediate_slice.end {
+            unimplemented!()
+        }
+
+        // Splice the content into the bracket match: keep it as one child when it
+        // carries its own `matched` tag, otherwise hoist its children.
+        let mut child_matches = bracketed_match.child_matches;
+        if content_match.matched.is_some() {
+            child_matches.push(content_match);
+        } else {
+            child_matches.extend(content_match.child_matches);
+        }
+
+        Ok(MatchResult {
+            child_matches,
+            ..bracketed_match
+        })
+    }
+
+    /// Evaluates a conditional meta node at `idx`.
+    ///
+    /// When the parse context's indentation config contains the payload's
+    /// requirements, returns a zero-width result that inserts `payload.meta` at
+    /// `idx`; otherwise returns an empty match with no inserts.
+    fn match_conditional(
+        &self,
+        node_id: NodeId,
+        idx: u32,
+        parse_context: &mut CompiledParseContext,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.conditional_payload(node_id);
+        if !parse_context
+            .indentation_config
+            .contains(payload.requirements)
+        {
+            return Ok(MatchResult::empty_at(idx));
+        }
+
+        Ok(MatchResult {
+            span: Span {
+                start: idx,
+                end: idx,
+            },
+            insert_segments: vec![(idx, payload.meta)],
+            ..Default::default()
+        })
+    }
+
+    /// Matches the single segment at `idx` when its type is
+    /// `SyntaxKind::Bracketed`.
+    ///
+    /// Returns an empty match otherwise, including when `idx` is past the end of
+    /// `segments` (consistent with `match_lookahead_exclude`) instead of
+    /// panicking on the index.
+    fn match_bracketed_segment(
+        &self,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        if segments
+            .get(idx as usize)
+            .is_some_and(|segment| segment.get_type() == SyntaxKind::Bracketed)
+        {
+            return Ok(MatchResult::from_span(idx, idx + 1));
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    /// Matches the segment at `idx` only when its raw text equals the payload's
+    /// first token and the next code segment after it equals the lookahead token
+    /// (both comparisons ASCII case-insensitive).
+    ///
+    /// Only the first token is consumed; the lookahead segment is inspected but
+    /// not included in the returned span.
+    fn match_lookahead_exclude(
+        &self,
+        node_id: NodeId,
+        segments: &[ErasedSegment],
+        idx: u32,
+    ) -> Result<MatchResult, SQLParseError> {
+        let payload = self.lookahead_payload(node_id);
+
+        if idx >= segments.len() as u32 {
+            return Ok(MatchResult::empty_at(idx));
+        }
+
+        let current_raw = segments[idx as usize].raw();
+        if current_raw.eq_ignore_ascii_case(self.string_value(payload.first_token)) {
+            let next_idx =
+                skip_start_index_forward_to_code(segments, idx + 1, segments.len() as u32);
+
+            if next_idx < segments.len() as u32 {
+                let next_raw = segments[next_idx as usize].raw();
+                if next_raw.eq_ignore_ascii_case(self.string_value(payload.lookahead_token)) {
+                    return Ok(MatchResult::from_span(idx, idx + 1));
+                }
+            }
+        }
+
+        Ok(MatchResult::empty_at(idx))
+    }
+
+    #[inline]
+    fn option_matches_first_token(
+
&self, + option: NodeId, + parse_context: &mut CompiledParseContext, + first_token: Option<(&str, &SyntaxSet)>, + ) -> bool { + let Some((first_raw, first_types)) = first_token else { + return true; + }; + + let Some(simple) = self.simple(option, parse_context, None) else { + return true; + }; + let (simple_raws, simple_types) = simple.as_ref(); + simple_raws.contains(first_raw) || first_types.intersects(simple_types) + } + + fn longest_match( + &self, + segments: &[ErasedSegment], + matchers: &[NodeId], + idx: u32, + parse_context: &mut CompiledParseContext, + ) -> Result<(MatchResult, Option), SQLParseError> { + let max_idx = segments.len() as u32; + + if matchers.is_empty() || idx == max_idx { + return Ok((MatchResult::empty_at(idx), None)); + } + + let first_token = first_non_whitespace(segments, idx); + let first_token = first_token + .as_ref() + .map(|(first_raw, first_types)| (first_raw.as_str(), *first_types)); + let mut available_options_count = 0; + + for &matcher in matchers { + if self.option_matches_first_token(matcher, parse_context, first_token) { + available_options_count += 1; + } + } + + if available_options_count == 0 { + return Ok((MatchResult::empty_at(idx), None)); + } + + let mut terminators_for_early_break: Option> = None; + let cache_position = segments[idx as usize].get_position_marker().unwrap(); + + let (working_line_no, working_line_pos) = cache_position.working_loc(); + let loc_key = ( + working_line_no, + working_line_pos, + segments[idx as usize].get_type(), + max_idx, + ); + + let loc_key = parse_context.loc_key(loc_key); + + let mut best_match = MatchResult::empty_at(idx); + let mut best_matcher = None; + let mut available_options_seen = 0; + + 'matcher: for &matcher in matchers { + if !self.option_matches_first_token(matcher, parse_context, first_token) { + continue; + } + + available_options_seen += 1; + let matcher_key = self.node_cache_key(matcher); + let res_match = + if let Some(res_match) = 
parse_context.check_parse_cache(loc_key, matcher_key) { + res_match + } else { + let computed = self.match_node(matcher, segments, idx, parse_context)?; + parse_context.put_parse_cache(loc_key, matcher_key, computed) + }; + + if res_match.has_match() && res_match.span.end == max_idx { + return Ok((res_match.clone(), Some(matcher))); + } + + if res_match.is_better_than(&best_match) { + best_match = res_match.clone(); + best_matcher = Some(matcher); + + if available_options_seen == available_options_count { + break 'matcher; + } else if !parse_context.terminators.is_empty() { + let next_code_idx = skip_start_index_forward_to_code( + segments, + best_match.span.end, + segments.len() as u32, + ); + + if next_code_idx == segments.len() as u32 { + break 'matcher; + } + + let terminators = terminators_for_early_break + .get_or_insert_with(|| parse_context.terminators.clone()); + + for terminator in terminators.iter().copied() { + let terminator_match = + self.match_node(terminator, segments, next_code_idx, parse_context)?; + + if terminator_match.has_match() { + break 'matcher; + } + } + } + } + } + + Ok((best_match, best_matcher)) + } + + fn prepare_next_match( + &self, + matchers: &[NodeId], + parse_context: &mut CompiledParseContext, + ) -> NextMatchPrepared { + let mut raw_simple_map: AHashMap> = AHashMap::new(); + let mut type_simple_map: AHashMap> = AHashMap::new(); + + for (matcher_idx, matcher) in enumerate(matchers) { + let Some(simple) = self.simple(*matcher, parse_context, None) else { + continue; + }; + let (raws, types) = simple.as_ref(); + + raw_simple_map.reserve(raws.len()); + type_simple_map.reserve(types.len()); + + for raw in raws { + raw_simple_map + .entry(raw.clone()) + .or_default() + .push(matcher_idx); + } + + for typ in types { + type_simple_map.entry(typ).or_default().push(matcher_idx); + } + } + + NextMatchPrepared { + type_simple_keys: type_simple_map.keys().copied().collect(), + raw_simple_map, + type_simple_map, + } + } + + fn 
next_match_with_prepared( + &self, + segments: &[ErasedSegment], + idx: u32, + matchers: &[NodeId], + prepared: &NextMatchPrepared, + scratch: &mut NextMatchScratch, + parse_context: &mut CompiledParseContext, + ) -> Result<(MatchResult, Option), SQLParseError> { + let max_idx = segments.len() as u32; + + if idx >= max_idx { + return Ok((MatchResult::empty_at(idx), None)); + } + + scratch.ensure_matcher_capacity(matchers.len()); + scratch.ensure_segment_capacity(max_idx as usize); + + for scan_idx in idx..max_idx { + let seg_idx = scan_idx as usize; + let seg = &segments[seg_idx]; + scratch.matcher_idxs.clear(); + scratch.visit_stamp = scratch.visit_stamp.wrapping_add(1); + if scratch.visit_stamp == 0 { + scratch.visited[..matchers.len()].fill(0); + scratch.visit_stamp = 1; + } + let visit_stamp = scratch.visit_stamp; + + let seg_addr = seg.addr(); + if scratch.raw_key_addrs[seg_idx] != seg_addr { + scratch.raw_key_addrs[seg_idx] = seg_addr; + scratch.raw_keys[seg_idx] = Some(first_trimmed_raw(seg)); + } + + if let Some(raw_key) = scratch.raw_keys[seg_idx].as_deref() + && let Some(raw_matchers) = prepared.raw_simple_map.get(raw_key) + { + for &matcher_idx in raw_matchers { + if scratch.visited[matcher_idx] != visit_stamp { + scratch.visited[matcher_idx] = visit_stamp; + scratch.matcher_idxs.push(matcher_idx); + } + } + } + + let type_overlap = seg + .class_types() + .clone() + .intersection(&prepared.type_simple_keys); + + for typ in type_overlap { + if let Some(type_matchers) = prepared.type_simple_map.get(&typ) { + for &matcher_idx in type_matchers { + if scratch.visited[matcher_idx] != visit_stamp { + scratch.visited[matcher_idx] = visit_stamp; + scratch.matcher_idxs.push(matcher_idx); + } + } + } + } + + if scratch.matcher_idxs.is_empty() { + continue; + } + + for &matcher_idx in &scratch.matcher_idxs { + let matcher = matchers[matcher_idx]; + let match_result = self.match_node(matcher, segments, scan_idx, parse_context)?; + + if match_result.has_match() { + 
return Ok((match_result, Some(matcher))); + } + } + } + + Ok((MatchResult::empty_at(idx), None)) + } + + #[allow(clippy::too_many_arguments)] + fn resolve_bracket( + &self, + segments: &[ErasedSegment], + opening_match: MatchResult, + opening_matcher: NodeId, + start_brackets: &[NodeId], + end_brackets: &[NodeId], + bracket_persists: &[bool], + parse_context: &mut CompiledParseContext, + nested_match: bool, + ) -> Result { + let type_idx = start_brackets + .iter() + .position(|it| it == &opening_matcher) + .unwrap(); + let mut matched_idx = opening_match.span.end; + let mut child_matches = vec![opening_match.clone()]; + + let mut matchers = Vec::with_capacity(start_brackets.len() + end_brackets.len()); + matchers.extend_from_slice(start_brackets); + matchers.extend_from_slice(end_brackets); + let prepared = self.prepare_next_match(&matchers, parse_context); + let mut scratch = NextMatchScratch::new(matchers.len()); + + loop { + let (match_result, matcher) = self.next_match_with_prepared( + segments, + matched_idx, + &matchers, + &prepared, + &mut scratch, + parse_context, + )?; + + if !match_result.has_match() { + return Err(SQLParseError { + description: "Couldn't find closing bracket for opening bracket.".into(), + segment: segments[opening_match.span.start as usize].clone().into(), + }); + } + + let matcher = matcher.unwrap(); + if end_brackets.contains(&matcher) { + let closing_idx = end_brackets.iter().position(|it| it == &matcher).unwrap(); + + if closing_idx == type_idx { + let match_span = match_result.span; + let persists = bracket_persists[type_idx]; + let insert_segments = vec![ + (opening_match.span.end, SyntaxKind::Indent), + (match_result.span.start, SyntaxKind::Dedent), + ]; + + child_matches.push(match_result); + let match_result = MatchResult { + span: Span { + start: opening_match.span.start, + end: match_span.end, + }, + matched: None, + insert_segments, + child_matches, + }; + + if !persists { + return Ok(match_result); + } + + return 
Ok(match_result.wrap(Matched::SyntaxKind(SyntaxKind::Bracketed))); + } + + return Err(SQLParseError { + description: "Found unexpected end bracket!".into(), + segment: segments[(match_result.span.end - 1) as usize] + .clone() + .into(), + }); + } + + let inner_match = self.resolve_bracket( + segments, + match_result, + matcher, + start_brackets, + end_brackets, + bracket_persists, + parse_context, + false, + )?; + + matched_idx = inner_match.span.end; + if nested_match { + child_matches.push(inner_match); + } + } + } + + fn next_ex_bracket_match( + &self, + segments: &[ErasedSegment], + idx: u32, + matchers: &[NodeId], + parse_context: &mut CompiledParseContext, + bracket_data: &NextExBracketPrepared, + scratch: &mut NextMatchScratch, + ) -> BracketMatch { + let max_idx = segments.len() as u32; + + if idx >= max_idx { + return Ok((MatchResult::empty_at(idx), None, Vec::new())); + } + + let mut matched_idx = idx; + let mut child_matches: Vec = Vec::new(); + + loop { + let (match_result, matcher) = self.next_match_with_prepared( + segments, + matched_idx, + &bracket_data.all_matchers, + &bracket_data.next_match_prepared, + scratch, + parse_context, + )?; + if !match_result.has_match() { + return Ok((match_result, matcher, child_matches)); + } + + if let Some(matcher) = matcher + .as_ref() + .filter(|matcher| matchers.contains(matcher)) + { + return Ok((match_result, Some(*matcher), child_matches)); + } + + if matcher + .as_ref() + .is_some_and(|matcher| bracket_data.end_brackets.contains(matcher)) + { + return Ok((MatchResult::empty_at(idx), None, Vec::new())); + } + + let bracket_match = self.resolve_bracket( + segments, + match_result, + matcher.unwrap(), + &bracket_data.start_brackets, + &bracket_data.end_brackets, + &bracket_data.bracket_persists, + parse_context, + true, + )?; + + matched_idx = bracket_match.span.end; + child_matches.push(bracket_match); + } + } + + fn prepare_next_ex_bracket_match( + &self, + matchers: &[NodeId], + parse_context: &mut 
CompiledParseContext, + bracket_pairs_set: &str, + ) -> NextExBracketPrepared { + let (_, start_bracket_refs, end_bracket_refs, bracket_persists): ( + Vec<_>, + Vec<_>, + Vec<_>, + Vec<_>, + ) = multiunzip(parse_context.dialect.bracket_sets(bracket_pairs_set)); + + let start_brackets = start_bracket_refs + .into_iter() + .filter_map(|seg_ref| self.get_definition_by_name(seg_ref)) + .collect_vec(); + + let end_brackets = end_bracket_refs + .into_iter() + .filter_map(|seg_ref| self.get_definition_by_name(seg_ref)) + .collect_vec(); + + let mut all_matchers = + Vec::with_capacity(matchers.len() + start_brackets.len() + end_brackets.len()); + all_matchers.extend_from_slice(matchers); + all_matchers.extend_from_slice(&start_brackets); + all_matchers.extend_from_slice(&end_brackets); + + let next_match_prepared = self.prepare_next_match(&all_matchers, parse_context); + + NextExBracketPrepared { + start_brackets, + end_brackets, + bracket_persists, + all_matchers, + next_match_prepared, + } + } + + fn greedy_match( + &self, + segments: &[ErasedSegment], + idx: u32, + parse_context: &mut CompiledParseContext, + matchers: &[NodeId], + include_terminator: bool, + nested_match: bool, + ) -> Result { + let mut working_idx = idx; + let mut stop_idx: u32; + let mut child_matches = Vec::new(); + let mut matched: MatchResult; + let bracket_data = + self.prepare_next_ex_bracket_match(matchers, parse_context, "bracket_pairs"); + let mut next_match_scratch = NextMatchScratch::new(bracket_data.all_matchers.len()); + + loop { + let (match_result, matcher, inner_matches) = + parse_context.deeper_match(false, &[], |ctx| { + self.next_ex_bracket_match( + segments, + working_idx, + matchers, + ctx, + &bracket_data, + &mut next_match_scratch, + ) + })?; + + matched = match_result; + + if nested_match { + child_matches.extend(inner_matches); + } + + if !matched.has_match() { + return Ok(MatchResult { + span: Span { + start: idx, + end: segments.len() as u32, + }, + matched: None, + 
insert_segments: Vec::new(), + child_matches, + }); + } + + let start_idx = matched.span.start; + stop_idx = matched.span.end; + + let matcher = matcher.unwrap(); + let simple = self.simple(matcher, parse_context, None); + + if let Some(simple) = simple { + let (strings, types) = simple.as_ref(); + if types.is_empty() && strings.iter().all(|s| s.chars().all(|c| c.is_alphabetic())) + { + let mut allowable_match = start_idx == working_idx; + + for idx in (working_idx..=start_idx).rev() { + if segments[idx as usize - 1].is_meta() { + continue; + } + + allowable_match = matches!( + segments[idx as usize - 1].get_type(), + SyntaxKind::Whitespace | SyntaxKind::Newline + ); + + break; + } + + if !allowable_match { + working_idx = stop_idx; + continue; + } + } + } + + break; + } + + if include_terminator { + return Ok(MatchResult { + span: Span { + start: idx, + end: stop_idx, + }, + ..MatchResult::default() + }); + } + + let stop_idx = skip_stop_index_backward_to_code(segments, matched.span.start, idx); + + let span = if idx == stop_idx { + Span { + start: idx, + end: matched.span.start, + } + } else { + Span { + start: idx, + end: stop_idx, + } + }; + + Ok(MatchResult { + span, + child_matches, + ..Default::default() + }) + } + + fn trim_to_terminator( + &self, + segments: &[ErasedSegment], + idx: u32, + terminators: &[NodeId], + parse_context: &mut CompiledParseContext, + ) -> Result { + if idx >= segments.len() as u32 { + return Ok(segments.len() as u32); + } + + let first_token = first_non_whitespace(segments, idx); + let first_token = first_token + .as_ref() + .map(|(first_raw, first_types)| (first_raw.as_str(), *first_types)); + + let early_return = parse_context.deeper_match(false, &[], |ctx| { + for &term in terminators { + if !self.option_matches_first_token(term, ctx, first_token) { + continue; + } + if self.match_node(term, segments, idx, ctx)?.has_match() { + return Ok(Some(idx)); + } + } + + Ok(None) + })?; + + if let Some(idx) = early_return { + return 
Ok(idx); + } + + let term_match = parse_context.deeper_match(false, &[], |ctx| { + self.greedy_match(segments, idx, ctx, terminators, false, false) + })?; + + Ok(skip_stop_index_backward_to_code( + segments, + term_match.span.end, + idx, + )) + } +} + +trait PayloadExt { + fn as_sequence(&self) -> Option<&SequencePayload>; + fn as_any_number_of(&self) -> Option<&AnyNumberOfPayload>; +} + +impl PayloadExt for Payload { + fn as_sequence(&self) -> Option<&SequencePayload> { + match self { + Payload::Sequence(payload) => Some(payload), + _ => None, + } + } + + fn as_any_number_of(&self) -> Option<&AnyNumberOfPayload> { + match self { + Payload::AnyNumberOf(payload) => Some(payload), + _ => None, + } + } +} + +struct LegacyCompiler<'a> { + dialect: &'a Dialect, + grammar: CompiledGrammar, + seen: AHashMap, + eq_representatives: Vec<(Matchable, u32)>, + next_eq_group: u32, +} + +impl LegacyCompiler<'_> { + fn next_eq_group(&mut self) -> u32 { + let group = self.next_eq_group; + self.next_eq_group = self.next_eq_group.saturating_add(1); + group + } + + fn legacy_eq_group(&mut self, matchable: &Matchable) -> u32 { + for (representative, group) in &self.eq_representatives { + if representative == matchable { + return *group; + } + } + + let group = self.next_eq_group(); + self.eq_representatives.push((matchable.clone(), group)); + group + } + + fn compile_many(&mut self, elems: &[Matchable]) -> Result, CompileError> { + elems + .iter() + .map(|elem| self.compile_matchable(elem)) + .collect() + } + + fn compile_slice(&mut self, elems: &[Matchable]) -> Result { + let compiled = self.compile_many(elems)?; + Ok(self.grammar.push_children(compiled)) + } + + fn compile_matchable(&mut self, matchable: &Matchable) -> Result { + let ptr = matchable.ptr() as usize; + if let Some(id) = self.seen.get(&ptr).copied() { + return Ok(id); + } + + let eq_group = self.legacy_eq_group(matchable); + + let id = match matchable.deref() { + MatchableTraitImpl::AnyNumberOf(any) => { + let children 
= self.compile_many(any.elements())?; + let payload = AnyNumberOfPayload { + exclude: any + .exclude + .as_ref() + .map(|exclude| self.compile_matchable(exclude)) + .transpose()?, + terminators: self.compile_slice(&any.terminators)?, + reset_terminators: any.reset_terminators, + max_times: any.max_times, + min_times: any.min_times, + max_times_per_element: any.max_times_per_element, + allow_gaps: any.allow_gaps, + optional: any.is_optional(), + parse_mode: any.parse_mode.into(), + }; + + let kind = if any.max_times == Some(1) + && any.min_times == 1 + && any.max_times_per_element.is_none() + { + Kind::OneOf + } else { + Kind::AnyNumberOf + }; + + self.grammar.make_any_number_of(kind, children, payload) + } + MatchableTraitImpl::Bracketed(bracketed) => { + let inner_children = self.compile_many(bracketed.elements())?; + let inner_terminators = self.compile_many(&bracketed.terminators)?; + let inner = self.grammar.make_sequence( + inner_children, + bracketed.parse_mode.into(), + bracketed.this.allow_gaps, + bracketed.is_optional(), + inner_terminators, + ); + + let payload = BracketedPayload { + bracket_type: self.grammar.intern_symbol(bracketed.bracket_type), + bracket_pairs_set: self.grammar.intern_symbol(bracketed.bracket_pairs_set), + allow_gaps: bracketed.outer_allow_gaps(), + parse_mode: bracketed.parse_mode.into(), + inner, + }; + + self.grammar + .push_node(Kind::Bracketed, inner.0, 0, Payload::Bracketed(payload)) + } + MatchableTraitImpl::NodeMatcher(node_matcher) => { + let child = self.compile_matchable(&node_matcher.match_grammar(self.dialect))?; + let payload = NodeMatcherPayload { + node_kind: node_matcher.get_type(), + child, + }; + + self.grammar.push_node( + Kind::NodeMatcher, + payload.node_kind as u32, + payload.child.0, + Payload::NodeMatcher(payload), + ) + } + MatchableTraitImpl::NonCodeMatcher(_) => self.grammar.non_code(), + MatchableTraitImpl::Nothing(_) => self.grammar.nothing(), + MatchableTraitImpl::Ref(r#ref) => { + let symbol = 
self.grammar.intern_symbol(r#ref.reference()); + let payload = RefPayload { + symbol, + exclude: r#ref + .exclude + .as_ref() + .map(|exclude| self.compile_matchable(exclude)) + .transpose()?, + terminators: self.compile_slice(r#ref.terminators_slice())?, + reset_terminators: r#ref.reset_terminators_flag(), + optional: r#ref.is_optional(), + resolved: None, + }; + + self.grammar + .push_node(Kind::Ref, symbol, 0, Payload::Ref(payload)) + } + MatchableTraitImpl::Sequence(sequence) => { + let sequence_children = self.compile_many(sequence.elements())?; + let sequence_terminators = self.compile_many(&sequence.terminators)?; + self.grammar.make_sequence( + sequence_children, + sequence.parse_mode.into(), + sequence.allow_gaps, + sequence.is_optional(), + sequence_terminators, + ) + } + MatchableTraitImpl::StringParser(parser) => { + let payload = StringPayload { + template: self.grammar.intern_string(parser.template()), + kind: parser.kind(), + optional: parser.is_optional(), + }; + + self.grammar.push_node( + Kind::String, + payload.template, + payload.kind as u32, + Payload::String(payload), + ) + } + MatchableTraitImpl::TypedParser(parser) => { + let payload = TypedPayload { + template: parser.template(), + kind: parser.kind(), + optional: parser.is_optional(), + }; + + self.grammar.push_node( + Kind::Typed, + payload.template as u32, + payload.kind as u32, + Payload::Typed(payload), + ) + } + MatchableTraitImpl::CodeParser(_) => self.grammar.code(), + MatchableTraitImpl::MetaSegment(meta) => self.grammar.push_node( + Kind::Meta, + meta.kind as u32, + 0, + Payload::Meta(MetaPayload { kind: meta.kind }), + ), + MatchableTraitImpl::MultiStringParser(parser) => { + let template_ids = parser + .templates() + .iter() + .map(|it| NodeId(self.grammar.intern_string(it))) + .collect_vec(); + let template_slice = self.grammar.push_children(template_ids); + let payload = MultiStringPayload { + templates: template_slice, + kind: parser.kind(), + }; + + self.grammar.push_node( + 
Kind::MultiString, + template_slice.start, + template_slice.len, + Payload::MultiString(payload), + ) + } + MatchableTraitImpl::RegexParser(parser) => { + let pattern = parser.template.as_str(); + let anti_pattern = parser.anti_template.as_ref().map(|it| it.as_str()); + let regex_id = self.grammar.intern_regex(pattern, anti_pattern); + let payload = RegexPayload { + regex_id, + kind: parser.kind(), + }; + + self.grammar.push_node( + Kind::Regex, + regex_id, + payload.kind as u32, + Payload::Regex(payload), + ) + } + MatchableTraitImpl::Delimited(delimited) => { + let delimited_children = self.compile_many(delimited.elements())?; + let child_slice = self.grammar.push_children(delimited_children); + let payload = DelimitedPayload { + allow_trailing: delimited.allow_trailing, + delimiter: self.compile_matchable(&delimited.delimiter)?, + min_delimiters: delimited.min_delimiters, + optional_delimiter: delimited.optional_delimiter, + optional: delimited.is_optional(), + allow_gaps: delimited.allow_gaps, + terminators: self.compile_slice(&delimited.terminators)?, + }; + + self.grammar.push_node( + Kind::Delimited, + child_slice.start, + child_slice.len, + Payload::Delimited(payload), + ) + } + MatchableTraitImpl::Anything(anything) => { + let payload = AnythingPayload { + terminators: self.compile_slice(anything.terminators_slice())?, + }; + self.grammar + .push_node(Kind::Anything, 0, 0, Payload::Anything(payload)) + } + MatchableTraitImpl::Conditional(conditional) => { + let payload = ConditionalPayload { + meta: conditional.meta_kind(), + requirements: conditional.requirements(), + }; + + self.grammar.push_node( + Kind::Conditional, + payload.meta as u32, + 0, + Payload::Conditional(payload), + ) + } + MatchableTraitImpl::BracketedSegmentMatcher(_) => { + self.grammar + .push_node(Kind::BracketedSegmentMatcher, 0, 0, Payload::None) + } + MatchableTraitImpl::LookaheadExclude(lookahead) => { + let payload = LookaheadExcludePayload { + first_token: 
self.grammar.intern_string(lookahead.first_token()), + lookahead_token: self.grammar.intern_string(lookahead.lookahead_token()), + }; + + self.grammar.push_node( + Kind::LookaheadExclude, + payload.first_token, + payload.lookahead_token, + Payload::LookaheadExclude(payload), + ) + } + }; + + self.grammar.set_node_eq_group(id, eq_group); + self.seen.insert(ptr, id); + Ok(id) + } +} + +fn parse_mode_match_result( + segments: &[ErasedSegment], + current_match: MatchResult, + max_idx: u32, + parse_mode: ParseMode, +) -> MatchResult { + if parse_mode == ParseMode::Strict { + return current_match; + } + + let stop_idx = current_match.span.end; + if stop_idx == max_idx + || segments[stop_idx as usize..max_idx as usize] + .iter() + .all(|it| !it.is_code()) + { + return current_match; + } + + let trim_idx = skip_start_index_forward_to_code(segments, stop_idx, segments.len() as u32); + + let unmatched_match = MatchResult { + span: Span { + start: trim_idx, + end: max_idx, + }, + matched: Some(Matched::SyntaxKind(SyntaxKind::Unparsable)), + ..MatchResult::default() + }; + + current_match.append(unmatched_match) +} + +fn flush_metas( + tpre_nc_idx: u32, + post_nc_idx: u32, + meta_buffer: Vec, +) -> Vec<(u32, SyntaxKind)> { + let meta_idx = if meta_buffer.iter().all(|it| it.indent_val() >= 0) { + tpre_nc_idx + } else { + post_nc_idx + }; + meta_buffer.into_iter().map(|it| (meta_idx, it)).collect() +} + +impl SQLParseError { + fn new(description: impl Into) -> Self { + Self { + description: description.into(), + segment: None, + } + } +} + +pub type Grammar = CompiledGrammar; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_builder_define_and_compile_resolves_refs() { + let mut g = CompiledGrammar::new(); + let div = g.keyword("DIV"); + let div_op = g.node_matcher(SyntaxKind::BinaryOperator, div); + g.define("DivBinaryOperatorSegment", div_op); + + let plus = g.ref_("PlusSegment"); + let minus = g.ref_("MinusSegment"); + let div_ref = 
g.ref_("DivBinaryOperatorSegment"); + let arith = g.one_of([plus, minus, div_ref]); + g.define("ArithmeticBinaryOperatorGrammar", arith); + + let plus_kw = g.keyword("+"); + let minus_kw = g.keyword("-"); + g.define("PlusSegment", plus_kw); + g.define("MinusSegment", minus_kw); + + let g = g.compile().unwrap(); + assert!(g.root("ArithmeticBinaryOperatorGrammar").is_some()); + } +} diff --git a/crates/lib-core/src/parser/context.rs b/crates/lib-core/src/parser/context.rs deleted file mode 100644 index 300a33deb..000000000 --- a/crates/lib-core/src/parser/context.rs +++ /dev/null @@ -1,139 +0,0 @@ -use rustc_hash::FxHashMap; -use smol_str::SmolStr; - -use super::match_result::MatchResult; -use super::matchable::{Matchable, MatchableCacheKey}; -use crate::dialects::Dialect; -use crate::dialects::syntax::SyntaxKind; -use crate::helpers::IndexSet; -use crate::parser::IndentationConfig; -use crate::parser::Parser; - -type LocKey = u32; -type LocKeyData = (SmolStr, (usize, usize), SyntaxKind, u32); - -#[derive(Debug, PartialEq, Eq, Hash)] -pub struct CacheKey { - loc: LocKey, - key: MatchableCacheKey, -} - -impl CacheKey { - pub fn new(loc: LocKey, key: MatchableCacheKey) -> Self { - Self { loc, key } - } -} - -#[derive(Debug)] -pub struct ParseContext<'a> { - dialect: &'a Dialect, - pub(crate) terminators: Vec, - loc_keys: IndexSet, - parse_cache: FxHashMap, - pub(crate) indentation_config: IndentationConfig, -} - -impl<'a> From<&'a Parser<'a>> for ParseContext<'a> { - fn from(parser: &'a Parser) -> Self { - let dialect = parser.dialect(); - let indentation_config = parser.indentation_config; - Self::new(dialect, indentation_config) - } -} - -impl<'a> ParseContext<'a> { - pub fn new(dialect: &'a Dialect, indentation_config: IndentationConfig) -> Self { - Self { - dialect, - terminators: Vec::new(), - loc_keys: IndexSet::default(), - parse_cache: FxHashMap::default(), - indentation_config, - } - } - - pub fn dialect(&self) -> &Dialect { - self.dialect - } - - pub(crate) 
fn deeper_match( - &mut self, - clear_terminators: bool, - push_terminators: &[Matchable], - f: impl FnOnce(&mut Self) -> T, - ) -> T { - let (appended, terms) = self.set_terminators(clear_terminators, push_terminators); - - let ret = f(self); - self.reset_terminators(appended, terms, clear_terminators); - - ret - } - - fn set_terminators( - &mut self, - clear_terminators: bool, - push_terminators: &[Matchable], - ) -> (usize, Vec) { - let mut appended = 0; - let terminators = self.terminators.clone(); - - if clear_terminators && !self.terminators.is_empty() { - self.terminators = if !push_terminators.is_empty() { - push_terminators.to_vec() - } else { - Vec::new() - }; - } else if !push_terminators.is_empty() { - for terminator in push_terminators { - let terminator_owned = terminator.clone(); - - if !self.terminators.contains(terminator) { - self.terminators.push(terminator_owned); - appended += 1; - } - } - } - - (appended, terminators) - } - - fn reset_terminators( - &mut self, - appended: usize, - terminators: Vec, - clear_terminators: bool, - ) { - if clear_terminators { - self.terminators = terminators; - } else { - let new_len = self.terminators.len().saturating_sub(appended); - self.terminators.truncate(new_len); - } - } - - pub(crate) fn loc_key(&mut self, data: LocKeyData) -> LocKey { - let (key, _) = self.loc_keys.insert_full(data); - key as u32 - } - - pub(crate) fn check_parse_cache( - &self, - loc_key: LocKey, - matcher_key: MatchableCacheKey, - ) -> Option { - self.parse_cache - .get(&CacheKey::new(loc_key, matcher_key)) - .cloned() - } - - pub(crate) fn put_parse_cache( - &mut self, - loc_key: LocKey, - matcher_key: MatchableCacheKey, - match_result: MatchResult, - ) { - self.parse_cache - .insert(CacheKey::new(loc_key, matcher_key), match_result); - } -} diff --git a/crates/lib-core/src/parser/grammar.rs b/crates/lib-core/src/parser/grammar.rs index 7866271a4..3c61e76cb 100644 --- a/crates/lib-core/src/parser/grammar.rs +++ 
b/crates/lib-core/src/parser/grammar.rs @@ -8,16 +8,12 @@ use ahash::AHashSet; use std::borrow::Cow; use std::sync::OnceLock; +use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxSet; -use crate::errors::SQLParseError; use crate::helpers::ToMatchable; -use crate::parser::context::ParseContext; -use crate::parser::match_algorithms::greedy_match; -use crate::parser::match_result::MatchResult; use crate::parser::matchable::{ Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key, }; -use crate::parser::segments::ErasedSegment; #[derive(Clone)] pub struct Ref { @@ -77,6 +73,18 @@ impl Ref { self } + pub(crate) fn reference(&self) -> &str { + &self.reference + } + + pub(crate) fn terminators_slice(&self) -> &[Matchable] { + &self.terminators + } + + pub(crate) fn reset_terminators_flag(&self) -> bool { + self.reset_terminators + } + // Static method to create a Ref instance for a keyword #[track_caller] pub fn keyword(keyword: impl Into>) -> Self { @@ -113,7 +121,7 @@ impl MatchableTrait for Ref { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { self.simple_cache @@ -128,46 +136,13 @@ impl MatchableTrait for Ref { let mut new_crumbs = crumbs.unwrap_or_default(); new_crumbs.push(&self.reference); - parse_context - .dialect() + dialect .r#ref(&self.reference) - .simple(parse_context, Some(new_crumbs)) + .simple(dialect, Some(new_crumbs)) }) .clone() } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - let elem = parse_context.dialect().r#ref(&self.reference); - - if let Some(exclude) = &self.exclude { - let ctx = - parse_context.deeper_match(self.reset_terminators, &self.terminators, |this| { - if exclude - .match_segments(segments, idx, this) - .inspect_err(|e| log::error!("Parser error: {e:?}")) - .is_ok_and(|match_result| match_result.has_match()) - { - return 
Some(MatchResult::empty_at(idx)); - } - - None - }); - - if let Some(ctx) = ctx { - return Ok(ctx); - } - } - - parse_context.deeper_match(self.reset_terminators, &self.terminators, |this| { - elem.match_segments(segments, idx, this) - }) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -204,6 +179,10 @@ impl Anything { self.terminators = terminators; self } + + pub(crate) fn terminators_slice(&self) -> &[Matchable] { + &self.terminators + } } impl MatchableTrait for Anything { @@ -211,22 +190,6 @@ impl MatchableTrait for Anything { &[] } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - if self.terminators.is_empty() && parse_context.terminators.is_empty() { - return Ok(MatchResult::from_span(idx, segments.len() as u32)); - } - - let mut terminators = self.terminators.clone(); - terminators.extend_from_slice(&parse_context.terminators); - - greedy_match(segments, idx, parse_context, &terminators, false, true) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -251,13 +214,4 @@ impl MatchableTrait for Nothing { fn elements(&self) -> &[Matchable] { &[] } - - fn match_segments( - &self, - _segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - Ok(MatchResult::empty_at(idx)) - } } diff --git a/crates/lib-core/src/parser/grammar/anyof.rs b/crates/lib-core/src/parser/grammar/anyof.rs index d033c9e37..7a057704f 100644 --- a/crates/lib-core/src/parser/grammar/anyof.rs +++ b/crates/lib-core/src/parser/grammar/anyof.rs @@ -1,63 +1,22 @@ use ahash::AHashSet; -use itertools::{Itertools, chain}; -use nohash_hasher::IntMap; use super::sequence::{Bracketed, Sequence}; -use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; +use crate::dialects::Dialect; +use crate::dialects::syntax::SyntaxSet; use crate::helpers::ToMatchable; -use crate::parser::context::ParseContext; -use 
crate::parser::match_algorithms::{ - longest_match, skip_start_index_forward_to_code, trim_to_terminator, -}; -use crate::parser::match_result::{MatchResult, Matched, Span}; use crate::parser::matchable::{ Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key, }; -use crate::parser::segments::ErasedSegment; use crate::parser::types::ParseMode; -fn parse_mode_match_result( - segments: &[ErasedSegment], - current_match: MatchResult, - max_idx: u32, - parse_mode: ParseMode, -) -> MatchResult { - if parse_mode == ParseMode::Strict { - return current_match; - } - - let stop_idx = current_match.span.end; - if stop_idx == max_idx - || segments[stop_idx as usize..max_idx as usize] - .iter() - .all(|it| !it.is_code()) - { - return current_match; - } - - let trim_idx = skip_start_index_forward_to_code(segments, stop_idx, segments.len() as u32); - - let unmatched_match = MatchResult { - span: Span { - start: trim_idx, - end: max_idx, - }, - matched: Matched::SyntaxKind(SyntaxKind::Unparsable).into(), - ..MatchResult::default() - }; - - current_match.append(unmatched_match) -} - pub fn simple( elements: &[Matchable], - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { let option_simples: Vec, SyntaxSet)>> = elements .iter() - .map(|opt| opt.simple(parse_context, crumbs.clone())) + .map(|opt| opt.simple(dialect, crumbs.clone())) .collect(); if option_simples.iter().any(Option::is_none) { @@ -150,114 +109,10 @@ impl MatchableTrait for AnyNumberOf { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { - simple(&self.elements, parse_context, crumbs) - } - - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - if let Some(exclude) = &self.exclude { - let match_result = parse_context - .deeper_match(false, &[], |ctx| exclude.match_segments(segments, idx, ctx))?; - - if 
match_result.has_match() { - return Ok(MatchResult::empty_at(idx)); - } - } - - let mut n_matches = 0; - let mut option_counter: IntMap<_, usize> = self - .elements - .iter() - .map(|elem| (elem.cache_key(), 0)) - .collect(); - let mut matched_idx = idx; - let mut working_idx = idx; - let mut matched = MatchResult::empty_at(idx); - let mut max_idx = segments.len() as u32; - - if self.parse_mode == ParseMode::Greedy { - let terminators = if self.reset_terminators { - self.terminators.clone() - } else { - chain(self.terminators.clone(), parse_context.terminators.clone()).collect_vec() - }; - max_idx = trim_to_terminator(segments, idx, &terminators, parse_context)?; - } - - loop { - if (n_matches >= self.min_times && matched_idx >= max_idx) - || self.max_times.is_some() && Some(n_matches) >= self.max_times - { - return Ok(parse_mode_match_result( - segments, - matched, - max_idx, - self.parse_mode, - )); - } - - if matched_idx >= max_idx { - return Ok(MatchResult::empty_at(idx)); - } - - let (match_result, matched_option) = - parse_context.deeper_match(self.reset_terminators, &self.terminators, |ctx| { - longest_match( - &segments[..max_idx as usize], - &self.elements, - working_idx, - ctx, - ) - })?; - - if !match_result.has_match() { - if n_matches < self.min_times { - matched = MatchResult::empty_at(idx); - } - - return Ok(parse_mode_match_result( - segments, - matched, - max_idx, - self.parse_mode, - )); - } - - let matched_option = matched_option.unwrap(); - let matched_key = matched_option.cache_key(); - - if let Some(counter) = option_counter.get_mut(&matched_key) { - *counter += 1; - - if self - .max_times_per_element - .is_some_and(|max_times_per_element| *counter > max_times_per_element) - { - return Ok(parse_mode_match_result( - segments, - matched, - max_idx, - self.parse_mode, - )); - } - } - - matched = matched.append(match_result); - matched_idx = matched.span.end; - working_idx = matched_idx; - if self.allow_gaps { - working_idx = - 
skip_start_index_forward_to_code(segments, matched_idx, segments.len() as u32); - } - n_matches += 1; - } + simple(&self.elements, dialect, crumbs) } fn cache_key(&self) -> MatchableCacheKey { diff --git a/crates/lib-core/src/parser/grammar/conditional.rs b/crates/lib-core/src/parser/grammar/conditional.rs index 01ad3783f..16a7ff736 100644 --- a/crates/lib-core/src/parser/grammar/conditional.rs +++ b/crates/lib-core/src/parser/grammar/conditional.rs @@ -1,9 +1,5 @@ -use crate::errors::SQLParseError; use crate::parser::IndentationConfig; -use crate::parser::context::ParseContext; -use crate::parser::match_result::{MatchResult, Span}; use crate::parser::matchable::{Matchable, MatchableTrait}; -use crate::parser::segments::ErasedSegment; use crate::parser::segments::meta::Indent; #[derive(Clone, Debug, PartialEq)] @@ -53,8 +49,12 @@ impl Conditional { self.require(IndentationConfig::INDENTED_JOINS_ON) } - fn is_enabled(&self, parse_context: &ParseContext) -> bool { - parse_context.indentation_config.contains(self.requirements) + pub(crate) fn meta_kind(&self) -> crate::dialects::syntax::SyntaxKind { + self.meta.kind + } + + pub(crate) fn requirements(&self) -> IndentationConfig { + self.requirements } } @@ -62,24 +62,4 @@ impl MatchableTrait for Conditional { fn elements(&self) -> &[Matchable] { &[] } - - fn match_segments( - &self, - _segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - if !self.is_enabled(parse_context) { - return Ok(MatchResult::empty_at(idx)); - } - - Ok(MatchResult { - span: Span { - start: idx, - end: idx, - }, - insert_segments: vec![(idx, self.meta.kind)], - ..Default::default() - }) - } } diff --git a/crates/lib-core/src/parser/grammar/delimited.rs b/crates/lib-core/src/parser/grammar/delimited.rs index 5395b59d0..c6c144e1d 100644 --- a/crates/lib-core/src/parser/grammar/delimited.rs +++ b/crates/lib-core/src/parser/grammar/delimited.rs @@ -3,18 +3,13 @@ use std::ops::{Deref, DerefMut}; use 
ahash::AHashSet; use super::anyof::{AnyNumberOf, one_of}; +use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxSet; -use crate::errors::SQLParseError; use crate::helpers::ToMatchable; -use crate::parser::context::ParseContext; use crate::parser::grammar::Ref; -use crate::parser::grammar::noncode::NonCodeMatcher; -use crate::parser::match_algorithms::{longest_match, skip_start_index_forward_to_code}; -use crate::parser::match_result::MatchResult; use crate::parser::matchable::{ Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key, }; -use crate::parser::segments::ErasedSegment; /// Match an arbitrary number of elements separated by a delimiter. /// @@ -77,119 +72,10 @@ impl MatchableTrait for Delimited { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { - super::anyof::simple(&self.elements, parse_context, crumbs) - } - - /// Match an arbitrary number of elements separated by a delimiter. - /// - /// Note that if there are multiple elements passed in that they will be - /// treated as different options of what can be delimited, rather than a - /// sequence. 
- fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - let mut delimiters = 0; - let mut seeking_delimiter = false; - let max_idx = segments.len() as u32; - let mut working_idx = idx; - let mut working_match = MatchResult::empty_at(idx); - let mut delimiter_match = None; - - let delimiter_matcher = self.delimiter.clone(); - - let mut terminator_matchers = self.terminators.clone(); - terminator_matchers.extend( - parse_context - .terminators - .iter() - .filter(|&t| &delimiter_matcher != t) - .cloned(), - ); - - let delimiter_matchers = std::slice::from_ref(&self.delimiter); - - if !self.allow_gaps { - terminator_matchers.push(NonCodeMatcher.to_matchable()); - } - - loop { - if self.allow_gaps && working_idx > idx { - working_idx = - skip_start_index_forward_to_code(segments, working_idx, segments.len() as u32); - } - - if working_idx >= max_idx { - break; - } - - let (match_result, _) = parse_context.deeper_match(false, &[], |this| { - longest_match(segments, &terminator_matchers, working_idx, this) - })?; - - if match_result.has_match() { - break; - } - - let mut push_terminators: &[_] = &[]; - if !seeking_delimiter { - push_terminators = delimiter_matchers; - } - - let (match_result, _) = - parse_context.deeper_match(false, push_terminators, |this| { - longest_match( - segments, - if seeking_delimiter { - delimiter_matchers - } else { - &self.elements - }, - working_idx, - this, - ) - })?; - - if !match_result.has_match() { - if seeking_delimiter && self.optional_delimiter { - seeking_delimiter = false; - continue; - } - break; - } - - working_idx = match_result.span.end; - - if seeking_delimiter { - delimiter_match = match_result.into(); - } else { - if let Some(delimiter_match) = &delimiter_match { - delimiters += 1; - working_match = working_match.append(delimiter_match); - } - working_match = working_match.append(match_result); - } - - seeking_delimiter = !seeking_delimiter; - } - - if let 
Some(delimiter_match) = - delimiter_match.filter(|_delimiter_match| self.allow_trailing && !seeking_delimiter) - { - delimiters += 1; - working_match = working_match.append(delimiter_match); - } - - if delimiters < self.min_delimiters { - return Ok(MatchResult::empty_at(idx)); - } - - Ok(working_match) + super::anyof::simple(&self.elements, dialect, crumbs) } fn cache_key(&self) -> MatchableCacheKey { diff --git a/crates/lib-core/src/parser/grammar/noncode.rs b/crates/lib-core/src/parser/grammar/noncode.rs index c4f839a2f..b5e3c33b8 100644 --- a/crates/lib-core/src/parser/grammar/noncode.rs +++ b/crates/lib-core/src/parser/grammar/noncode.rs @@ -1,11 +1,8 @@ use ahash::AHashSet; +use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxSet; -use crate::errors::SQLParseError; -use crate::parser::context::ParseContext; -use crate::parser::match_result::{MatchResult, Span}; use crate::parser::matchable::{Matchable, MatchableCacheKey, MatchableTrait}; -use crate::parser::segments::ErasedSegment; #[derive(Debug, Clone, PartialEq)] pub struct NonCodeMatcher; @@ -22,40 +19,12 @@ impl MatchableTrait for NonCodeMatcher { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { None } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - let mut matched_idx = idx; - - for i in idx..segments.len() as u32 { - if segments[i as usize].is_code() { - matched_idx = i; - break; - } - } - - if matched_idx > idx { - return Ok(MatchResult { - span: Span { - start: idx, - end: matched_idx, - }, - ..Default::default() - }); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { 0 } diff --git a/crates/lib-core/src/parser/grammar/sequence.rs b/crates/lib-core/src/parser/grammar/sequence.rs index 66a983094..296fa5514 100644 --- a/crates/lib-core/src/parser/grammar/sequence.rs +++ 
b/crates/lib-core/src/parser/grammar/sequence.rs @@ -3,35 +3,14 @@ use std::ops::{Deref, DerefMut}; use ahash::AHashSet; -use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; +use crate::dialects::Dialect; +use crate::dialects::syntax::SyntaxSet; use crate::helpers::ToMatchable; -use crate::parser::context::ParseContext; -use crate::parser::match_algorithms::{ - resolve_bracket, skip_start_index_forward_to_code, skip_stop_index_backward_to_code, - trim_to_terminator, -}; -use crate::parser::match_result::{MatchResult, Matched, Span}; use crate::parser::matchable::{ Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key, }; -use crate::parser::segments::ErasedSegment; use crate::parser::types::ParseMode; -fn flush_metas( - tpre_nc_idx: u32, - post_nc_idx: u32, - meta_buffer: Vec, - _segments: &[ErasedSegment], -) -> Vec<(u32, SyntaxKind)> { - let meta_idx = if meta_buffer.iter().all(|it| it.indent_val() >= 0) { - tpre_nc_idx - } else { - post_nc_idx - }; - meta_buffer.into_iter().map(|it| (meta_idx, it)).collect() -} - #[derive(Debug, Clone)] pub struct Sequence { elements: Vec, @@ -96,14 +75,14 @@ impl MatchableTrait for Sequence { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { let mut simple_raws = AHashSet::new(); let mut simple_types = SyntaxSet::EMPTY; for opt in &self.elements { - let (raws, types) = opt.simple(parse_context, crumbs.clone())?; + let (raws, types) = opt.simple(dialect, crumbs.clone())?; simple_raws.extend(raws); simple_types.extend(types); @@ -116,170 +95,6 @@ impl MatchableTrait for Sequence { (simple_raws, simple_types).into() } - fn match_segments( - &self, - segments: &[ErasedSegment], - mut idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - let start_idx = idx; - let mut matched_idx = idx; - let mut max_idx = segments.len() as u32; - let mut insert_segments = Vec::new(); - let mut child_matches = 
Vec::new(); - let mut first_match = true; - let mut meta_buffer = Vec::new(); - - if self.parse_mode == ParseMode::Greedy { - let terminators = - [self.terminators.clone(), parse_context.terminators.clone()].concat(); - - max_idx = trim_to_terminator(segments, idx, &terminators, parse_context)?; - } - - for elem in &self.elements { - if let Some(indent) = elem.as_conditional() { - let match_result = indent.match_segments(segments, matched_idx, parse_context)?; - for (_, submatch) in match_result.insert_segments { - meta_buffer.push(submatch); - } - continue; - } else if let Some(indent) = elem.as_indent() { - meta_buffer.push(indent.kind); - continue; - } - - idx = if self.allow_gaps { - skip_start_index_forward_to_code(segments, matched_idx, max_idx) - } else { - matched_idx - }; - - if idx >= max_idx { - if elem.is_optional() { - continue; - } - - if self.parse_mode == ParseMode::Strict || matched_idx == start_idx { - return Ok(MatchResult::empty_at(idx)); - } - - insert_segments.extend(meta_buffer.into_iter().map(|meta| (matched_idx, meta))); - - return Ok(MatchResult { - span: Span { - start: start_idx, - end: matched_idx, - }, - insert_segments, - child_matches, - matched: Matched::SyntaxKind(SyntaxKind::Unparsable).into(), - }); - } - - let mut elem_match = parse_context.deeper_match(false, &[], |ctx| { - elem.match_segments(&segments[..max_idx as usize], idx, ctx) - })?; - - if !elem_match.has_match() { - if elem.is_optional() { - continue; - } - - if self.parse_mode == ParseMode::Strict { - return Ok(MatchResult::empty_at(idx)); - } - - if self.parse_mode == ParseMode::GreedyOnceStarted && matched_idx == start_idx { - return Ok(MatchResult::empty_at(idx)); - } - - if matched_idx == start_idx { - return Ok(MatchResult { - span: Span { - start: start_idx, - end: max_idx, - }, - matched: Matched::SyntaxKind(SyntaxKind::Unparsable).into(), - ..MatchResult::default() - }); - } - - child_matches.push(MatchResult { - span: Span { - start: 
skip_start_index_forward_to_code(segments, matched_idx, max_idx), - end: max_idx, - }, - matched: Matched::SyntaxKind(SyntaxKind::Unparsable).into(), - ..MatchResult::default() - }); - - return Ok(MatchResult { - span: Span { - start: start_idx, - end: max_idx, - }, - insert_segments, - child_matches, - matched: None, - }); - } - - let meta_buffer = std::mem::take(&mut meta_buffer); - insert_segments.append(&mut flush_metas(matched_idx, idx, meta_buffer, segments)); - - matched_idx = elem_match.span.end; - - if first_match && self.parse_mode == ParseMode::GreedyOnceStarted { - let terminators = - [self.terminators.clone(), parse_context.terminators.clone()].concat(); - max_idx = trim_to_terminator(segments, matched_idx, &terminators, parse_context)?; - first_match = false; - } - - if elem_match.matched.is_some() { - child_matches.push(elem_match); - continue; - } - - child_matches.append(&mut elem_match.child_matches); - insert_segments.append(&mut elem_match.insert_segments); - } - - insert_segments.extend(meta_buffer.into_iter().map(|meta| (matched_idx, meta))); - - if matches!( - self.parse_mode, - ParseMode::Greedy | ParseMode::GreedyOnceStarted - ) && max_idx > matched_idx - { - let idx = skip_start_index_forward_to_code(segments, matched_idx, max_idx); - let stop_idx = skip_stop_index_backward_to_code(segments, max_idx, idx); - - if stop_idx > idx { - child_matches.push(MatchResult { - span: Span { - start: idx, - end: stop_idx, - }, - matched: Matched::SyntaxKind(SyntaxKind::Unparsable).into(), - ..Default::default() - }); - matched_idx = stop_idx; - } - } - - Ok(MatchResult { - span: Span { - start: start_idx, - end: matched_idx, - }, - matched: None, - insert_segments, - child_matches, - }) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -352,21 +167,23 @@ impl Bracketed { self.bracket_type = bracket_type; } - fn get_bracket_from_dialect(&self, parse_context: &ParseContext) -> BracketInfo { - let bracket_pairs = 
parse_context.dialect().bracket_sets(self.bracket_pairs_set); + pub(crate) fn outer_allow_gaps(&self) -> bool { + self.allow_gaps + } + + fn get_bracket_from_dialect(&self, dialect: &Dialect) -> BracketInfo { + let bracket_pairs = dialect.bracket_sets(self.bracket_pairs_set); for (bracket_type, start_ref, end_ref, persists) in bracket_pairs { if bracket_type == self.bracket_type { - let start_bracket = parse_context.dialect().r#ref(start_ref); - let end_bracket = parse_context.dialect().r#ref(end_ref); + let start_bracket = dialect.r#ref(start_ref); + let end_bracket = dialect.r#ref(end_ref); return Ok((start_bracket, end_bracket, persists)); } } Err(format!( "bracket_type {:?} not found in bracket_pairs ({}) of {:?} dialect.", - self.bracket_type, - self.bracket_pairs_set, - parse_context.dialect().name + self.bracket_type, self.bracket_pairs_set, dialect.name )) } } @@ -396,81 +213,11 @@ impl MatchableTrait for Bracketed { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { - let (start_bracket, _, _) = self.get_bracket_from_dialect(parse_context).unwrap(); - start_bracket.simple(parse_context, crumbs) - } - - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - let (start_bracket, end_bracket, bracket_persists) = - self.get_bracket_from_dialect(parse_context).unwrap(); - - let start_match = parse_context.deeper_match(false, &[], |ctx| { - start_bracket.match_segments(segments, idx, ctx) - })?; - - if !start_match.has_match() { - return Ok(MatchResult::empty_at(idx)); - } - - let start_match_span = start_match.span; - - let bracketed_match = resolve_bracket( - segments, - start_match, - start_bracket.clone(), - &[start_bracket], - std::slice::from_ref(&end_bracket), - &[bracket_persists], - parse_context, - false, - )?; - - let mut idx = start_match_span.end; - let mut end_idx = bracketed_match.span.end - 1; - - if 
self.allow_gaps { - idx = skip_start_index_forward_to_code(segments, idx, segments.len() as u32); - end_idx = skip_stop_index_backward_to_code(segments, end_idx, idx); - } - - let mut content_match = - parse_context.deeper_match(true, std::slice::from_ref(&end_bracket), |ctx| { - self.this - .match_segments(&segments[..end_idx as usize], idx, ctx) - })?; - - if content_match.span.end != end_idx && self.parse_mode == ParseMode::Strict { - return Ok(MatchResult::empty_at(idx)); - } - - let intermediate_slice = Span { - start: content_match.span.end, - end: bracketed_match.span.end - 1, - }; - - if !self.allow_gaps && intermediate_slice.start == intermediate_slice.end { - unimplemented!() - } - - let mut child_matches = bracketed_match.child_matches; - if content_match.matched.is_some() { - child_matches.push(content_match); - } else { - child_matches.append(&mut content_match.child_matches); - } - - Ok(MatchResult { - child_matches, - ..bracketed_match - }) + let (start_bracket, _, _) = self.get_bracket_from_dialect(dialect).unwrap(); + start_bracket.simple(dialect, crumbs) } fn cache_key(&self) -> MatchableCacheKey { diff --git a/crates/lib-core/src/parser/lookahead.rs b/crates/lib-core/src/parser/lookahead.rs index 6f60898b2..21164860a 100644 --- a/crates/lib-core/src/parser/lookahead.rs +++ b/crates/lib-core/src/parser/lookahead.rs @@ -1,12 +1,8 @@ use ahash::AHashSet; -use super::context::ParseContext; -use super::match_algorithms::skip_start_index_forward_to_code; -use super::match_result::MatchResult; use super::matchable::{Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key}; -use super::segments::ErasedSegment; +use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxSet; -use crate::errors::SQLParseError; /// A matcher that excludes patterns based on lookahead. 
/// @@ -31,6 +27,14 @@ impl LookaheadExclude { cache_key: next_matchable_cache_key(), } } + + pub(crate) fn first_token(&self) -> &str { + self.first_token + } + + pub(crate) fn lookahead_token(&self) -> &str { + self.lookahead_token + } } impl MatchableTrait for LookaheadExclude { @@ -45,44 +49,13 @@ impl MatchableTrait for LookaheadExclude { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { // LookaheadExclude doesn't have simple matching None } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - // Check if we're at a valid position - if idx >= segments.len() as u32 { - return Ok(MatchResult::empty_at(idx)); - } - - // Check if current token matches first pattern (case-insensitive) - let current_raw = segments[idx as usize].raw(); - if current_raw.eq_ignore_ascii_case(self.first_token) { - // Look ahead for second token, skipping any whitespace - let next_idx = - skip_start_index_forward_to_code(segments, idx + 1, segments.len() as u32); - - if next_idx < segments.len() as u32 { - let next_raw = segments[next_idx as usize].raw(); - if next_raw.eq_ignore_ascii_case(self.lookahead_token) { - // Match found - return a match to indicate this should be excluded - return Ok(MatchResult::from_span(idx, idx + 1)); - } - } - } - - // No match - don't exclude - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } diff --git a/crates/lib-core/src/parser/markers.rs b/crates/lib-core/src/parser/markers.rs index bc6bec1cf..c3a8afe7c 100644 --- a/crates/lib-core/src/parser/markers.rs +++ b/crates/lib-core/src/parser/markers.rs @@ -323,12 +323,12 @@ mod tests { Test { raw: "\n".to_string(), start: 2..2, - end: 3..1, + end: Range { start: 3, end: 1 }, }, Test { raw: "boo\n".to_string(), start: 2..2, - end: 3..1, + end: Range { start: 3, end: 1 }, }, Test { raw: "boo\nfoo".to_string(), 
@@ -393,11 +393,11 @@ mod tests { // TODO Finish these tests // Check less than assert!(a_pos < b_pos && b_pos < c_pos); - assert!(!(c_pos < a_pos)); + assert!(c_pos >= a_pos); // Check greater than assert!(c_pos > a_pos && c_pos > b_pos); - assert!(!(a_pos > c_pos)); + assert!(a_pos <= c_pos); // Check less than or equal assert!(all_pos.iter().all(|p| a_pos <= **p)); diff --git a/crates/lib-core/src/parser/match_algorithms.rs b/crates/lib-core/src/parser/match_algorithms.rs index bbedf2c06..e2a7ff5e6 100644 --- a/crates/lib-core/src/parser/match_algorithms.rs +++ b/crates/lib-core/src/parser/match_algorithms.rs @@ -1,15 +1,9 @@ -use ahash::AHashMap; -use itertools::{Itertools as _, enumerate, multiunzip}; use smol_str::StrExt; -use super::context::ParseContext; -use super::match_result::{MatchResult, Matched, Span}; -use super::matchable::{Matchable, MatchableTrait}; use super::segments::ErasedSegment; -use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; +use crate::dialects::syntax::SyntaxSet; -pub fn skip_start_index_forward_to_code( +pub(crate) fn skip_start_index_forward_to_code( segments: &[ErasedSegment], start_idx: u32, max_idx: u32, @@ -24,7 +18,7 @@ pub fn skip_start_index_forward_to_code( idx } -pub fn skip_stop_index_backward_to_code( +pub(crate) fn skip_stop_index_backward_to_code( segments: &[ErasedSegment], stop_idx: u32, min_idx: u32, @@ -39,7 +33,7 @@ pub fn skip_stop_index_backward_to_code( idx } -pub fn first_trimmed_raw(seg: &ErasedSegment) -> String { +pub(crate) fn first_trimmed_raw(seg: &ErasedSegment) -> String { seg.raw() .to_uppercase_smolstr() .split(char::is_whitespace) @@ -48,7 +42,7 @@ pub fn first_trimmed_raw(seg: &ErasedSegment) -> String { .unwrap_or_default() } -pub fn first_non_whitespace( +pub(crate) fn first_non_whitespace( segments: &[ErasedSegment], start_idx: u32, ) -> Option<(String, &SyntaxSet)> { @@ -60,488 +54,3 @@ pub fn first_non_whitespace( None } - -pub fn prune_options( - options: 
&[Matchable], - segments: &[ErasedSegment], - parse_context: &mut ParseContext, - start_idx: u32, -) -> Vec { - let mut available_options = vec![]; - - // Find the first code element to match against. - let Some((first_raw, first_types)) = first_non_whitespace(segments, start_idx) else { - return options.to_vec(); - }; - - for opt in options { - let Some(simple) = opt.simple(parse_context, None) else { - // This element is not simple, we have to do a - // full match with it... - available_options.push(opt.clone()); - continue; - }; - - // Otherwise we have a simple option, so let's use - // it for pruning. - let (simple_raws, simple_types) = simple; - let mut matched = false; - - // We want to know if the first meaningful element of the str_buff - // matches the option, based on either simple _raw_ matching or - // simple _type_ matching. - - // Match Raws - if simple_raws.contains(&first_raw) { - // If we get here, it's matched the FIRST element of the string buffer. - available_options.push(opt.clone()); - matched = true; - } - - if !matched && first_types.intersects(&simple_types) { - available_options.push(opt.clone()); - } - } - - available_options -} - -pub fn longest_match( - segments: &[ErasedSegment], - matchers: &[Matchable], - idx: u32, - parse_context: &mut ParseContext, -) -> Result<(MatchResult, Option), SQLParseError> { - let max_idx = segments.len() as u32; - - if matchers.is_empty() || idx == max_idx { - return Ok((MatchResult::empty_at(idx), None)); - } - - let available_options = prune_options(matchers, segments, parse_context, idx); - let available_options_count = available_options.len(); - - if available_options.is_empty() { - return Ok((MatchResult::empty_at(idx), None)); - } - - let terminators = parse_context.terminators.clone(); - let cache_position = segments[idx as usize].get_position_marker().unwrap(); - - let loc_key = ( - segments[idx as usize].raw().clone(), - cache_position.working_loc(), - segments[idx as usize].get_type(), - 
max_idx, - ); - - let loc_key = parse_context.loc_key(loc_key); - - let mut best_match = MatchResult::empty_at(idx); - let mut best_matcher = None; - - 'matcher: for (matcher_idx, matcher) in enumerate(available_options) { - let matcher_key = matcher.cache_key(); - let res_match = parse_context.check_parse_cache(loc_key, matcher_key); - - let res_match = match res_match { - Some(res_match) => res_match, - None => { - let res_match = matcher.match_segments(segments, idx, parse_context)?; - parse_context.put_parse_cache(loc_key, matcher_key, res_match.clone()); - res_match - } - }; - - if res_match.has_match() && res_match.span.end == max_idx { - return Ok((res_match, matcher.into())); - } - - if res_match.is_better_than(&best_match) { - best_match = res_match; - best_matcher = matcher.into(); - - if matcher_idx == available_options_count - 1 { - break 'matcher; - } else if !terminators.is_empty() { - let next_code_idx = skip_start_index_forward_to_code( - segments, - best_match.span.end, - segments.len() as u32, - ); - - if next_code_idx == segments.len() as u32 { - break 'matcher; - } - - for terminator in &terminators { - let terminator_match = - terminator.match_segments(segments, next_code_idx, parse_context)?; - - if terminator_match.has_match() { - break 'matcher; - } - } - } - } - } - - Ok((best_match, best_matcher)) -} - -fn next_match( - segments: &[ErasedSegment], - idx: u32, - matchers: &[Matchable], - parse_context: &mut ParseContext, -) -> Result<(MatchResult, Option), SQLParseError> { - let max_idx = segments.len() as u32; - - if idx >= max_idx { - return Ok((MatchResult::empty_at(idx), None)); - } - - let mut raw_simple_map: AHashMap> = AHashMap::new(); - let mut type_simple_map: AHashMap> = AHashMap::new(); - - for (idx, matcher) in enumerate(matchers) { - let (raws, types) = matcher.simple(parse_context, None).unwrap(); - - raw_simple_map.reserve(raws.len()); - type_simple_map.reserve(types.len()); - - for raw in raws { - 
raw_simple_map.entry(raw).or_default().push(idx); - } - - for typ in types { - type_simple_map.entry(typ).or_default().push(idx); - } - } - - for idx in idx..max_idx { - let seg = &segments[idx as usize]; - let mut matcher_idxs = raw_simple_map - .get(&first_trimmed_raw(seg)) - .cloned() - .unwrap_or_default(); - - let keys = type_simple_map.keys().copied().collect(); - let type_overlap = seg.class_types().clone().intersection(&keys); - - for typ in type_overlap { - matcher_idxs.extend(type_simple_map[&typ].clone()); - } - - if matcher_idxs.is_empty() { - continue; - } - - matcher_idxs.sort(); - for matcher_idx in matcher_idxs { - let matcher = &matchers[matcher_idx]; - let match_result = matcher.match_segments(segments, idx, parse_context)?; - - if match_result.has_match() { - return Ok((match_result, matcher.clone().into())); - } - } - } - - Ok((MatchResult::empty_at(idx), None)) -} - -#[allow(clippy::too_many_arguments)] -pub fn resolve_bracket( - segments: &[ErasedSegment], - opening_match: MatchResult, - opening_matcher: Matchable, - start_brackets: &[Matchable], - end_brackets: &[Matchable], - bracket_persists: &[bool], - parse_context: &mut ParseContext, - nested_match: bool, -) -> Result { - let type_idx = start_brackets - .iter() - .position(|it| it == &opening_matcher) - .unwrap(); - let mut matched_idx = opening_match.span.end; - let mut child_matches = vec![opening_match.clone()]; - - let matchers = [start_brackets, end_brackets].concat(); - loop { - let (match_result, matcher) = next_match(segments, matched_idx, &matchers, parse_context)?; - - if !match_result.has_match() { - return Err(SQLParseError { - description: "Couldn't find closing bracket for opening bracket.".into(), - segment: segments[opening_match.span.start as usize].clone().into(), - }); - } - - let matcher = matcher.unwrap(); - if end_brackets.contains(&matcher) { - let closing_idx = end_brackets.iter().position(|it| it == &matcher).unwrap(); - - if closing_idx == type_idx { - let 
match_span = match_result.span; - let persists = bracket_persists[type_idx]; - let insert_segments = vec![ - (opening_match.span.end, SyntaxKind::Indent), - (match_result.span.start, SyntaxKind::Dedent), - ]; - - child_matches.push(match_result); - let match_result = MatchResult { - span: Span { - start: opening_match.span.start, - end: match_span.end, - }, - matched: None, - insert_segments, - child_matches, - }; - - if !persists { - return Ok(match_result); - } - - return Ok(match_result.wrap(Matched::SyntaxKind(SyntaxKind::Bracketed))); - } - - return Err(SQLParseError { - description: "Found unexpected end bracket!".into(), - segment: segments[(match_result.span.end - 1) as usize] - .clone() - .into(), - }); - } - - let inner_match = resolve_bracket( - segments, - match_result, - matcher, - start_brackets, - end_brackets, - bracket_persists, - parse_context, - false, - )?; - - matched_idx = inner_match.span.end; - if nested_match { - child_matches.push(inner_match); - } - } -} - -type BracketMatch = Result<(MatchResult, Option, Vec), SQLParseError>; - -fn next_ex_bracket_match( - segments: &[ErasedSegment], - idx: u32, - matchers: &[Matchable], - parse_context: &mut ParseContext, - bracket_pairs_set: &'static str, -) -> BracketMatch { - let max_idx = segments.len() as u32; - - if idx >= max_idx { - return Ok((MatchResult::empty_at(idx), None, Vec::new())); - } - - let (_, start_bracket_refs, end_bracket_refs, bracket_persists): ( - Vec<_>, - Vec<_>, - Vec<_>, - Vec<_>, - ) = multiunzip(parse_context.dialect().bracket_sets(bracket_pairs_set)); - - let start_brackets = start_bracket_refs - .into_iter() - .map(|seg_ref| parse_context.dialect().r#ref(seg_ref)) - .collect_vec(); - - let end_brackets = end_bracket_refs - .into_iter() - .map(|seg_ref| parse_context.dialect().r#ref(seg_ref)) - .collect_vec(); - - let all_matchers = [matchers, &start_brackets, &end_brackets].concat(); - - let mut matched_idx = idx; - let mut child_matches: Vec = Vec::new(); - - loop { - 
let (match_result, matcher) = - next_match(segments, matched_idx, &all_matchers, parse_context)?; - if !match_result.has_match() { - return Ok((match_result, matcher.clone(), child_matches)); - } - - if let Some(matcher) = matcher - .as_ref() - .filter(|matcher| matchers.contains(matcher)) - { - return Ok((match_result, Some(matcher.clone()), child_matches)); - } - - if matcher - .as_ref() - .is_some_and(|matcher| end_brackets.contains(matcher)) - { - return Ok((MatchResult::empty_at(idx), None, Vec::new())); - } - - let bracket_match = resolve_bracket( - segments, - match_result, - matcher.unwrap(), - &start_brackets, - &end_brackets, - &bracket_persists, - parse_context, - true, - )?; - - matched_idx = bracket_match.span.end; - child_matches.push(bracket_match); - } -} - -pub fn greedy_match( - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - matchers: &[Matchable], - include_terminator: bool, - nested_match: bool, -) -> Result { - let mut working_idx = idx; - let mut stop_idx: u32; - let mut child_matches = Vec::new(); - let mut matched; - - loop { - let (match_result, matcher, inner_matches) = - parse_context.deeper_match(false, &[], |ctx| { - next_ex_bracket_match(segments, working_idx, matchers, ctx, "bracket_pairs") - })?; - - matched = match_result; - - if nested_match { - child_matches.extend(inner_matches); - } - - if !matched.has_match() { - return Ok(MatchResult { - span: Span { - start: idx, - end: segments.len() as u32, - }, - matched: None, - insert_segments: Vec::new(), - child_matches, - }); - } - - let start_idx = matched.span.start; - stop_idx = matched.span.end; - - let matcher = matcher.unwrap(); - let (strings, types) = matcher.simple(parse_context, None).unwrap(); - - if types.is_empty() && strings.iter().all(|s| s.chars().all(|c| c.is_alphabetic())) { - let mut allowable_match = start_idx == working_idx; - - for idx in (working_idx..=start_idx).rev() { - if segments[idx as usize - 1].is_meta() { - continue; - } - 
- allowable_match = matches!( - segments[idx as usize - 1].get_type(), - SyntaxKind::Whitespace | SyntaxKind::Newline - ); - - break; - } - - if !allowable_match { - working_idx = stop_idx; - continue; - } - } - - break; - } - - if include_terminator { - return Ok(MatchResult { - span: Span { - start: idx, - end: stop_idx, - }, - ..MatchResult::default() - }); - } - - let stop_idx = skip_stop_index_backward_to_code(segments, matched.span.start, idx); - - let span = if idx == stop_idx { - Span { - start: idx, - end: matched.span.start, - } - } else { - Span { - start: idx, - end: stop_idx, - } - }; - - Ok(MatchResult { - span, - child_matches, - ..Default::default() - }) -} - -pub fn trim_to_terminator( - segments: &[ErasedSegment], - idx: u32, - terminators: &[Matchable], - parse_context: &mut ParseContext, -) -> Result { - if idx >= segments.len() as u32 { - return Ok(segments.len() as u32); - } - - let early_return = parse_context.deeper_match(false, &[], |ctx| { - let pruned_terms = prune_options(terminators, segments, ctx, idx); - - for term in pruned_terms { - if term.match_segments(segments, idx, ctx)?.has_match() { - return Ok(Some(idx)); - } - } - - Ok(None) - })?; - - if let Some(idx) = early_return { - return Ok(idx); - } - - let term_match = parse_context.deeper_match(false, &[], |ctx| { - greedy_match(segments, idx, ctx, terminators, false, false) - })?; - - Ok(skip_stop_index_backward_to_code( - segments, - term_match.span.end, - idx, - )) -} diff --git a/crates/lib-core/src/parser/matchable.rs b/crates/lib-core/src/parser/matchable.rs index be56985a1..0ad643ad6 100644 --- a/crates/lib-core/src/parser/matchable.rs +++ b/crates/lib-core/src/parser/matchable.rs @@ -5,7 +5,6 @@ use std::sync::atomic::{AtomicU32, Ordering}; use ahash::AHashSet; use enum_dispatch::enum_dispatch; -use super::context::ParseContext; use super::grammar::anyof::AnyNumberOf; use super::grammar::conditional::Conditional; use super::grammar::delimited::Delimited; @@ -13,15 +12,12 
@@ use super::grammar::noncode::NonCodeMatcher; use super::grammar::sequence::{Bracketed, Sequence}; use super::grammar::{Anything, Nothing, Ref}; use super::lookahead::LookaheadExclude; -use super::match_result::MatchResult; use super::node_matcher::NodeMatcher; use super::parsers::{CodeParser, MultiStringParser, RegexParser, StringParser, TypedParser}; -use super::segments::ErasedSegment; use super::segments::bracketed::BracketedSegmentMatcher; use super::segments::meta::MetaSegment; use crate::dialects::Dialect; use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; #[derive(Clone, Debug, PartialEq)] pub struct Matchable { @@ -43,6 +39,10 @@ impl Matchable { } } + pub(crate) fn ptr(&self) -> *const MatchableTraitImpl { + Arc::as_ptr(&self.inner) + } + pub fn get_mut(&mut self) -> &mut MatchableTraitImpl { Arc::get_mut(&mut self.inner).unwrap() } @@ -158,21 +158,12 @@ pub trait MatchableTrait { // Note: the crumbs argument is used to detect recursion. 
fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { - let match_grammar = self.match_grammar(parse_context.dialect())?; + let match_grammar = self.match_grammar(dialect)?; - match_grammar.simple(parse_context, crumbs) - } - - fn match_segments( - &self, - _segments: &[ErasedSegment], - _idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - todo!(); + match_grammar.simple(dialect, crumbs) } fn cache_key(&self) -> MatchableCacheKey { diff --git a/crates/lib-core/src/parser/node_matcher.rs b/crates/lib-core/src/parser/node_matcher.rs index 3a114cddd..28d116234 100644 --- a/crates/lib-core/src/parser/node_matcher.rs +++ b/crates/lib-core/src/parser/node_matcher.rs @@ -3,11 +3,7 @@ use std::sync::OnceLock; use super::matchable::MatchableTrait; use crate::dialects::Dialect; use crate::dialects::syntax::SyntaxKind; -use crate::errors::SQLParseError; -use crate::parser::context::ParseContext; -use crate::parser::match_result::{MatchResult, Matched}; use crate::parser::matchable::Matchable; -use crate::parser::segments::ErasedSegment; #[derive(Clone)] pub struct NodeMatcher { @@ -65,25 +61,4 @@ impl MatchableTrait for NodeMatcher { fn elements(&self) -> &[Matchable] { &[] } - - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - parse_context: &mut ParseContext, - ) -> Result { - if idx >= segments.len() as u32 { - return Ok(MatchResult::empty_at(idx)); - } - - if segments[idx as usize].get_type() == self.get_type() { - return Ok(MatchResult::from_span(idx, idx + 1)); - } - - let grammar = self.match_grammar(parse_context.dialect()); - let match_result = parse_context - .deeper_match(false, &[], |ctx| grammar.match_segments(segments, idx, ctx))?; - - Ok(match_result.wrap(Matched::SyntaxKind(self.node_kind))) - } } diff --git a/crates/lib-core/src/parser/parsers.rs b/crates/lib-core/src/parser/parsers.rs index 862a8a1ef..6d7b8896d 100644 --- 
a/crates/lib-core/src/parser/parsers.rs +++ b/crates/lib-core/src/parser/parsers.rs @@ -1,13 +1,10 @@ use ahash::AHashSet; use fancy_regex::Regex; -use smol_str::SmolStr; -use super::context::ParseContext; -use super::match_result::{MatchResult, Matched, Span}; use super::matchable::{Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key}; use super::segments::ErasedSegment; +use crate::dialects::Dialect; use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; #[derive(Debug, Clone, PartialEq)] pub struct TypedParser { @@ -34,6 +31,14 @@ impl TypedParser { pub fn is_first_match(&self, segment: &ErasedSegment) -> bool { self.target_types.contains(segment.get_type()) } + + pub(crate) fn template(&self) -> SyntaxKind { + self.template + } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } } impl MatchableTrait for TypedParser { @@ -43,35 +48,13 @@ impl MatchableTrait for TypedParser { fn simple( &self, - parse_context: &ParseContext, + dialect: &Dialect, crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { - let _ = (parse_context, crumbs); + let _ = (dialect, crumbs); (AHashSet::new(), self.target_types.clone()).into() } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - let segment = &segments[idx as usize]; - if segment.is_type(self.template) { - return Ok(MatchResult { - span: Span { - start: idx, - end: idx + 1, - }, - matched: Matched::Newtype(self.kind).into(), - insert_segments: Vec::new(), - child_matches: Vec::new(), - }); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -103,29 +86,12 @@ impl MatchableTrait for CodeParser { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { None } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - 
) -> Result { - if idx as usize >= segments.len() { - return Ok(MatchResult::empty_at(idx)); - } - - if segments[idx as usize].is_code() { - return Ok(MatchResult::from_span(idx, idx + 1)); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -153,6 +119,14 @@ impl StringParser { cache_key: next_matchable_cache_key(), } } + + pub(crate) fn template(&self) -> &str { + &self.template + } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } } impl MatchableTrait for StringParser { @@ -166,35 +140,12 @@ impl MatchableTrait for StringParser { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { (self.simple.clone(), SyntaxSet::EMPTY).into() } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - let segment = &segments[idx as usize]; - - if segment.is_code() && self.template.eq_ignore_ascii_case(segment.raw()) { - return Ok(MatchResult { - span: Span { - start: idx, - end: idx + 1, - }, - matched: Matched::Newtype(self.kind).into(), - insert_segments: Vec::new(), - child_matches: Vec::new(), - }); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -236,6 +187,10 @@ impl RegexParser { self.anti_template = Regex::new(&format!("(?i){anti_template}")).unwrap().into(); self } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } } impl MatchableTrait for RegexParser { @@ -244,12 +199,12 @@ impl MatchableTrait for RegexParser { } fn is_optional(&self) -> bool { - unimplemented!() + false } fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { // Does this matcher support a uppercase hash matching route? 
@@ -257,37 +212,6 @@ impl MatchableTrait for RegexParser { None } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - let segment = &segments[idx as usize]; - let segment_raw_upper = - SmolStr::from_iter(segment.raw().chars().map(|ch| ch.to_ascii_uppercase())); - if let Some(result) = self.template.find(&segment_raw_upper).ok().flatten() - && result.as_str() == segment_raw_upper - && !self.anti_template.as_ref().is_some_and(|anti_template| { - anti_template - .is_match(&segment_raw_upper) - .unwrap_or_default() - }) - { - return Ok(MatchResult { - span: Span { - start: idx, - end: idx + 1, - }, - matched: Matched::Newtype(self.kind).into(), - insert_segments: Vec::new(), - child_matches: Vec::new(), - }); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } @@ -317,6 +241,14 @@ impl MultiStringParser { cache: next_matchable_cache_key(), } } + + pub(crate) fn templates(&self) -> Vec<&str> { + self.templates.iter().map(|it| it.as_str()).collect() + } + + pub(crate) fn kind(&self) -> SyntaxKind { + self.kind + } } impl MatchableTrait for MultiStringParser { @@ -325,39 +257,17 @@ impl MatchableTrait for MultiStringParser { } fn is_optional(&self) -> bool { - todo!() + false } fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { (self.simple.clone(), SyntaxSet::EMPTY).into() } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - let segment = &segments[idx as usize]; - - if segment.is_code() && self.templates.contains(&segment.raw().to_ascii_uppercase()) { - return Ok(MatchResult { - span: Span { - start: idx, - end: idx + 1, - }, - matched: Matched::Newtype(self.kind).into(), - ..<_>::default() - }); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache 
} diff --git a/crates/lib-core/src/parser/segments.rs b/crates/lib-core/src/parser/segments.rs index 5bde445eb..82abde1cc 100644 --- a/crates/lib-core/src/parser/segments.rs +++ b/crates/lib-core/src/parser/segments.rs @@ -1,5 +1,4 @@ pub mod bracketed; -pub mod file; pub mod fix; pub mod from; pub mod generator; diff --git a/crates/lib-core/src/parser/segments/bracketed.rs b/crates/lib-core/src/parser/segments/bracketed.rs index 94e7414a8..d9b10ac79 100644 --- a/crates/lib-core/src/parser/segments/bracketed.rs +++ b/crates/lib-core/src/parser/segments/bracketed.rs @@ -1,10 +1,7 @@ use ahash::AHashSet; -use super::ErasedSegment; -use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; -use crate::parser::context::ParseContext; -use crate::parser::match_result::MatchResult; +use crate::dialects::Dialect; +use crate::dialects::syntax::SyntaxSet; use crate::parser::matchable::{ Matchable, MatchableCacheKey, MatchableTrait, next_matchable_cache_key, }; @@ -35,25 +32,12 @@ impl MatchableTrait for BracketedSegmentMatcher { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { None } - fn match_segments( - &self, - segments: &[ErasedSegment], - idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - if segments[idx as usize].get_type() == SyntaxKind::Bracketed { - return Ok(MatchResult::from_span(idx, idx + 1)); - } - - Ok(MatchResult::empty_at(idx)) - } - fn cache_key(&self) -> MatchableCacheKey { self.cache_key } diff --git a/crates/lib-core/src/parser/segments/file.rs b/crates/lib-core/src/parser/segments/file.rs deleted file mode 100644 index c2a004fc2..000000000 --- a/crates/lib-core/src/parser/segments/file.rs +++ /dev/null @@ -1,102 +0,0 @@ -use crate::dialects::init::DialectKind; -use crate::dialects::syntax::SyntaxKind; -use crate::errors::SQLParseError; -use crate::parser::context::ParseContext; -use crate::parser::matchable::MatchableTrait; -use 
crate::parser::segments::{ErasedSegment, SegmentBuilder, Tables}; - -#[derive(Debug, Clone, PartialEq)] -pub struct FileSegment; - -impl FileSegment { - pub fn of( - tables: &Tables, - dialect: DialectKind, - segments: Vec, - ) -> ErasedSegment { - SegmentBuilder::node(tables.next_id(), SyntaxKind::File, dialect, segments) - .position_from_segments() - .finish() - } - - pub fn root_parse( - &self, - tables: &Tables, - dialect: DialectKind, - segments: &[ErasedSegment], - parse_context: &mut ParseContext, - ) -> Result { - let start_idx = segments - .iter() - .position(|segment| segment.is_code()) - .unwrap_or(0) as u32; - - let end_idx = segments - .iter() - .rposition(|segment| segment.is_code()) - .map_or(start_idx, |idx| idx as u32 + 1); - - if start_idx == end_idx { - return Ok(FileSegment::of(tables, dialect, segments.to_vec())); - } - - let final_seg = segments.last().unwrap(); - assert!(final_seg.get_position_marker().is_some()); - - let file_segment = parse_context.dialect().r#ref("FileSegment"); - - let match_result = file_segment - .match_grammar(parse_context.dialect()) - .unwrap() - .match_segments(&segments[..end_idx as usize], start_idx, parse_context)?; - - let match_span = match_result.span; - let has_match = match_result.has_match(); - let mut matched = match_result.apply(tables, dialect, segments); - let unmatched = &segments[match_span.end as usize..end_idx as usize]; - - let content: &[ErasedSegment] = if !has_match { - &[SegmentBuilder::node( - tables.next_id(), - SyntaxKind::Unparsable, - dialect, - segments[start_idx as usize..end_idx as usize].to_vec(), - ) - .position_from_segments() - .finish()] - } else if !unmatched.is_empty() { - let idx = unmatched - .iter() - .position(|it| it.is_code()) - .unwrap_or(unmatched.len()); - let (head, tail) = unmatched.split_at(idx); - - matched.extend_from_slice(head); - matched.push( - SegmentBuilder::node( - tables.next_id(), - SyntaxKind::Unparsable, - dialect, - tail.to_vec(), - ) - 
.position_from_segments() - .finish(), - ); - &matched - } else { - matched.extend_from_slice(unmatched); - &matched - }; - - Ok(Self::of( - tables, - dialect, - [ - &segments[..start_idx as usize], - content, - &segments[end_idx as usize..], - ] - .concat(), - )) - } -} diff --git a/crates/lib-core/src/parser/segments/meta.rs b/crates/lib-core/src/parser/segments/meta.rs index 212b2ad59..276799565 100644 --- a/crates/lib-core/src/parser/segments/meta.rs +++ b/crates/lib-core/src/parser/segments/meta.rs @@ -2,11 +2,8 @@ use std::fmt::Debug; use ahash::AHashSet; -use super::ErasedSegment; +use crate::dialects::Dialect; use crate::dialects::syntax::{SyntaxKind, SyntaxSet}; -use crate::errors::SQLParseError; -use crate::parser::context::ParseContext; -use crate::parser::match_result::MatchResult; use crate::parser::matchable::{Matchable, MatchableTrait}; pub type Indent = MetaSegment; @@ -42,21 +39,9 @@ impl MatchableTrait for MetaSegment { fn simple( &self, - _parse_context: &ParseContext, + _dialect: &Dialect, _crumbs: Option>, ) -> Option<(AHashSet, SyntaxSet)> { None } - - fn match_segments( - &self, - _segments: &[ErasedSegment], - _idx: u32, - _parse_context: &mut ParseContext, - ) -> Result { - panic!( - "{} has no match method, it should only be used in a Sequence!", - std::any::type_name::() - ); - } } diff --git a/crates/lib/benches/parsing.rs b/crates/lib/benches/parsing.rs index 2ee250d3c..bfba2bd68 100644 --- a/crates/lib/benches/parsing.rs +++ b/crates/lib/benches/parsing.rs @@ -1,11 +1,8 @@ use criterion::{Criterion, criterion_group, criterion_main}; use sqruff_lib::core::config::FluffConfig; -use sqruff_lib::core::test_functions::fresh_ansi_dialect; -use sqruff_lib_core::dialects::syntax::SyntaxKind; use sqruff_lib_core::parser::Parser; -use sqruff_lib_core::parser::context::ParseContext; -use sqruff_lib_core::parser::matchable::MatchableTrait as _; -use sqruff_lib_core::parser::segments::test_functions::lex; +use sqruff_lib_core::parser::lexer::Lexer; 
+use sqruff_lib_core::parser::segments::Tables; use std::hint::black_box; include!("shims/global_alloc_overwrite.rs"); @@ -78,8 +75,6 @@ and ( order by t1.id desc"#; fn parse(c: &mut Criterion) { - let dialect = fresh_ansi_dialect(); - let passes = [ ("parse_simple_query", SIMPLE_QUERY), ("parse_expression_recursion", EXPRESSION_RECURSION), @@ -88,20 +83,16 @@ fn parse(c: &mut Criterion) { for (name, source) in passes { let config = FluffConfig::default(); - let config_for_parser = config.clone(); - let parser: Parser = (&config_for_parser).into(); - let mut ctx: ParseContext = (&parser).into(); - let segment = dialect.r#ref("FileSegment"); - let mut segments = lex(config.get_dialect(), source); - - if segments.last().unwrap().get_type() == SyntaxKind::EndOfFile { - segments.pop(); - } + let parser: Parser = (&config).into(); + let tables = Tables::default(); + let lexer = Lexer::from(config.get_dialect()); + let (segments, errors) = lexer.lex(&tables, source); + assert!(errors.is_empty()); c.bench_function(name, |b| { b.iter(|| { - let match_result = segment.match_segments(&segments, 0, &mut ctx).unwrap(); - black_box(match_result); + let parsed = parser.parse(&tables, &segments).unwrap(); + black_box(parsed); }); }); } diff --git a/crates/lib/src/tests.rs b/crates/lib/src/tests.rs index 541f9a7c1..63e910e51 100644 --- a/crates/lib/src/tests.rs +++ b/crates/lib/src/tests.rs @@ -2,11 +2,8 @@ use itertools::Itertools; use sqruff_lib::core::config::FluffConfig; use sqruff_lib::core::linter::core::Linter; use sqruff_lib::core::test_functions::fresh_ansi_dialect; -use sqruff_lib_core::dialects::init::DialectKind; use sqruff_lib_core::dialects::syntax::SyntaxKind; use sqruff_lib_core::parser::Parser; -use sqruff_lib_core::parser::context::ParseContext; -use sqruff_lib_core::parser::matchable::MatchableTrait; use sqruff_lib_core::parser::segments::Tables; use sqruff_lib_core::parser::segments::test_functions::lex; @@ -148,24 +145,15 @@ fn 
test_dialect_ansi_specific_segment_parses() { ), ]; - let dialect = fresh_ansi_dialect(); let config: FluffConfig = FluffConfig::new(<_>::default(), None, None); for (segment_ref, sql_string) in cases { let config = config.clone(); let parser: Parser = (&config).into(); - let mut ctx: ParseContext = (&parser).into(); - - let segment = dialect.r#ref(segment_ref); - let mut segments = lex(&dialect, sql_string); - - if segments.last().unwrap().get_type() == SyntaxKind::EndOfFile { - segments.pop(); - } + let segments = lex(config.get_dialect(), sql_string); let tables = Tables::default(); - let match_result = segment.match_segments(&segments, 0, &mut ctx).unwrap(); - let mut parsed = match_result.apply(&tables, DialectKind::Ansi, &segments); + let mut parsed = parser.parse_as(&tables, segment_ref, &segments).unwrap(); assert_eq!(parsed.len(), 1, "failed {segment_ref}, {sql_string}");