1- use regex_automata:: { dfa:: { Automaton , dense} , Anchored , util:: start} ;
2- use pyo3:: prelude:: * ;
1+ use core:: iter:: Iterator ;
2+ use std:: vec:: Vec ;
3+ use std:: collections:: VecDeque ;
4+ use regex_automata:: { dfa:: { self , dense, Automaton } , util:: { primitives:: StateID , start} , Anchored } ;
5+ //use pyo3::prelude::*;
6+
7+
// A terminal is represented by a `str` holding the regex that matches that
// terminal. This representation is temporary and exists to facilitate
// inter-language calling. An accept sequence is therefore a list of `str`,
// each of which is a regex.
11+
12+
13+ /// Track states of DFAs with their name. This is useful when iterating over
14+ /// states of more than one DFA.
15+ struct DFAState {
16+ /// The regex defining this DFA.
17+ regex : Box < str > ,
18+ /// The state of this DFA.
19+ state_id : StateID
20+ }
21+
322
423/// Consume the longest prefix of input that is accepted by the DFA, returning
524/// the remainder.
@@ -9,26 +28,25 @@ use pyo3::prelude::*;
928/// and there is still string left, return (true, remainder).
1029///
1130/// # Examples
12- ///
1331/// ```
1432/// let re = r"[a-zA-Z_]\w*";
15- /// let result = consume_prefix(re, "this_is_a_python_name");
16- /// assert_eq!(result, (true, ""));
33+ /// let dfa = dense::DFA::new(re).unwrap();
34+ ///
35+ /// let result = consume_prefix(&dfa, "this_is_a_python_name");
36+ /// assert_eq!(result, (true, Some(String::new())));
1737///
18- /// let result = consume_prefix(re , "this_is_a_python_name followed_by_other_stuff");
38+ /// let result = consume_prefix(&dfa , "this_is_a_python_name followed_by_other_stuff");
1939/// assert_eq!(result, (true, Some(String::from(" followed_by_other_stuff"))));
2040///
21- /// let result = consume_prefix(re , "this_is't_a_python_name");
41+ /// let result = consume_prefix(&dfa , "this_is't_a_python_name");
2242/// assert_eq!(result, (true, Some(String::from("'t_a_python_name"))));
2343///
24- /// let result = consume_prefix(re , "'tai'nt_a_python_name");
44+ /// let result = consume_prefix(&dfa , "'tai'nt_a_python_name");
2545/// assert_eq!(result, (false, None));
2646/// ```
27- #[ pyfunction]
28- fn consume_prefix ( re : & str , input : & str ) -> ( bool , Option < String > ) {
47+ fn consume_prefix ( dfa : & dyn Automaton , input : & str ) -> ( bool , Option < String > ) {
2948 // Only match starting from the beginning of the string.
3049 let config = start:: Config :: new ( ) . anchored ( Anchored :: Yes ) ;
31- let dfa = dense:: DFA :: new ( re) . unwrap ( ) ;
3250 let mut state = dfa. start_state ( & config) . unwrap ( ) ;
3351 let mut remainder = None ;
3452
@@ -56,9 +74,127 @@ fn consume_prefix(re: &str, input: &str) -> (bool, Option<String>) {
5674 return ( true , Some ( String :: new ( ) ) ) ;
5775}
5876
59- #[ pymodule]
60- fn rust_syncode ( _py : Python , m : & PyModule ) -> PyResult < ( ) > {
61- m. add_function ( wrap_pyfunction ! ( consume_prefix, m) ?) ?;
6277
63- Ok ( ( ) )
78+ /// Compute whether the string could match a sequence of terminals starting at a certain state in the first DFA.
79+ ///
80+ /// Given a DFA D(Q, Σ, δ, q0, F ), a string w ∈ Σ∗, a DFA state q ∈ Q and any sequence of terminals Λ = {τf +1, τf +2 . . . τf +d}, dmatch(w, q, Λ) = true, if either of the following conditions hold:
81+ /// 1. δ∗(w, q) ∈ live(Q) or
82+ /// 2. ∃w1 ∈ Σ∗, w2 ∈ Σ+ such that w1.w2 = w, δ∗(w1, q) ∈ F and Λ = {} or
83+ /// 3. ∃w1 ∈ Σ∗, w2 ∈ Σ∗ such that w1.w2 = w, δ∗(w1, q) ∈ F, and dmatch(w2, qτf +10 , {τf +2 . . . τf +d}) = true where qτf +10 is the start state corresponding to the DFA for τf +1.
84+ ///
85+ fn dmatch ( string : & str , starting_state : DFAState , sequence_of_terminals : Vec < String > ) -> bool {
86+ let dfa = dense:: Builder :: new ( ) . configure ( dense:: DFA :: config ( ) . start_kind ( dfa:: StartKind :: Anchored ) ) . build ( & starting_state. regex ) . unwrap ( ) ;
87+
88+ // Case 1: the DFA, starting at this state, consumes the entire input and is still alive.
89+ let mut state = starting_state. state_id ;
90+ for & b in string. as_bytes ( ) . iter ( ) {
91+ state = dfa. next_state ( state, b) ;
92+ }
93+ if !dfa. is_dead_state ( state) {
94+ return true ;
95+ }
96+
97+ // Case 2: The DFA consumes a prefix of the string, leaves a non-zero
98+ // suffix, and there is no sequence of terminals to follow.
99+ let mut state = starting_state. state_id ;
100+ for ( i, & b) in string. as_bytes ( ) . iter ( ) . enumerate ( ) {
101+ state = dfa. next_state ( state, b) ;
102+ if dfa. is_match_state ( state) & sequence_of_terminals. is_empty ( ) & ( i < string. len ( ) ) {
103+ return true ;
104+ }
105+ }
106+
107+ // Case 3: A prefix of the string is successfully consumed by the DFA, and
108+ // dmatch is true starting at the next member of sequence_of_terminals.
109+ let mut state = starting_state. state_id ;
110+ for ( i, & b) in string. as_bytes ( ) . iter ( ) . enumerate ( ) {
111+ state = dfa. next_state ( state, b) ;
112+ if dfa. is_match_state ( state) {
113+ let new_dfa = dense:: Builder :: new ( ) . configure ( dense:: DFA :: config ( ) . start_kind ( dfa:: StartKind :: Anchored ) ) . build ( & sequence_of_terminals[ 0 ] ) . unwrap ( ) ;
114+ let new_starting_state = new_dfa. start_state ( & start:: Config :: new ( ) . anchored ( Anchored :: Yes ) ) . unwrap ( ) ;
115+ let new_regex = sequence_of_terminals[ 0 ] . clone ( ) ;
116+ return dmatch ( & string[ i..] , DFAState { regex : new_regex. into ( ) , state_id : new_starting_state} , sequence_of_terminals[ 1 ..] . to_vec ( ) ) ;
117+ }
118+ }
119+
120+ // None of the previous cases succeeded, so dmatch is false.
121+ false
122+ }
123+
124+
125+ /// Return all states of a dfa by breadth-first search. There exists a private
126+ /// method that returns an iterator over all states. The suggested alternative
127+ /// is to traverse the graph manually. See
128+ /// https://github.com/rust-lang/regex/discussions/1223.
129+ fn states ( dfa : & dense:: DFA < Vec < u32 > > ) -> Vec < StateID > {
130+ let mut queue: VecDeque < StateID > = VecDeque :: new ( ) ;
131+ let mut explored: Vec < StateID > = Vec :: new ( ) ;
132+
133+ let start = dfa. start_state ( & start:: Config :: new ( ) ) . unwrap ( ) ;
134+
135+ explored. push ( start) ;
136+ queue. push_back ( start) ;
137+ while !queue. is_empty ( ) {
138+ let v = queue. pop_front ( ) . unwrap ( ) ;
139+ if dfa. is_dead_state ( v) {
140+ continue ;
141+ }
142+ for letter in dfa. byte_classes ( ) . representatives ( 0 ..=255 ) {
143+ let next = dfa. next_state ( start, letter. as_u8 ( ) . unwrap ( ) ) ;
144+ if !explored. contains ( & next) {
145+ explored. push ( next) ;
146+ }
147+ queue. push_back ( next) ;
148+ }
149+ }
150+ explored
151+ }
152+
153+
154+ /// Compute the union of all states of a list of regexes.
155+ fn all_dfa_states ( terminals : Vec < & str > ) -> Vec < DFAState > {
156+ let mut res: Vec < DFAState > = Vec :: new ( ) ;
157+ for terminal in terminals {
158+ let dfa = dense:: DFA :: new ( terminal) . unwrap ( ) ;
159+ let states = states ( & dfa) ;
160+ for state in states {
161+ res. push ( DFAState { regex : terminal. into ( ) , state_id : state} ) ;
162+ }
163+ }
164+ res
165+ }
166+
167+
168+ // #[pymodule]
169+ // fn rust_syncode(_py: Python, m: &PyModule) -> PyResult<()> {
170+ // m.add_function(wrap_pyfunction!(consume_prefix, m)?)?;
171+
172+ // Ok(())
173+ // }
174+
#[cfg(test)]
mod tests {
    use super::*;

    /// `consume_prefix` strips the longest prefix that is a Python-style
    /// identifier and reports whether anything was consumed.
    #[test]
    fn test_consume_prefix() {
        let dfa = dense::DFA::new(r"[a-zA-Z_]\w*").unwrap();

        // (input, expected result) pairs, checked in order.
        let cases: [(&str, (bool, Option<String>)); 4] = [
            ("this_is_a_python_name", (true, Some(String::new()))),
            (
                "this_is_a_python_name followed_by_other_stuff",
                (true, Some(String::from(" followed_by_other_stuff"))),
            ),
            (
                "this_is't_a_python_name",
                (true, Some(String::from("'t_a_python_name"))),
            ),
            ("'tai'nt_a_python_name", (false, None)),
        ];

        for (input, expected) in cases {
            let result = consume_prefix(&dfa, input);
            assert_eq!(result, expected);
        }
    }

    /// Placeholder: no assertions for `dmatch` yet.
    #[test]
    fn test_dmatch() {}
}
0 commit comments