Skip to content

Commit ff332d0

Browse files
committed
Implement dmatch.
This is definitely not correct, and tests are lacking. Will add in the next commit. Pushing now to share intermediate state of work.
1 parent a490bf5 commit ff332d0

File tree

1 file changed

+151
-15
lines changed

1 file changed

+151
-15
lines changed

src/lib.rs

Lines changed: 151 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
1-
use regex_automata::{dfa::{Automaton, dense}, Anchored, util::start};
2-
use pyo3::prelude::*;
1+
use core::iter::Iterator;
2+
use std::vec::Vec;
3+
use std::collections::VecDeque;
4+
use regex_automata::{dfa::{self, dense, Automaton}, util::{primitives::StateID, start}, Anchored};
5+
//use pyo3::prelude::*;
6+
7+
8+
/// We represent a terminal as a str representing the regex matching that
9+
/// terminal. This choice is temporary to facilitate inter-language calling.
10+
/// An accept sequence is then a list of str, each a regex.
11+
12+
13+
/// Track states of DFAs with their name. This is useful when iterating over
14+
/// states of more than one DFA.
15+
struct DFAState {
16+
/// The regex defining this DFA.
17+
regex: Box<str>,
18+
/// The state of this DFA.
19+
state_id: StateID
20+
}
21+
322

423
/// Consume the longest prefix of input that is accepted by the DFA, returning
524
/// the remainder.
@@ -9,26 +28,25 @@ use pyo3::prelude::*;
928
/// and there is still string left, return (true, remainder).
1029
///
1130
/// # Examples
12-
///
1331
/// ```
1432
/// let re = r"[a-zA-Z_]\w*";
15-
/// let result = consume_prefix(re, "this_is_a_python_name");
16-
/// assert_eq!(result, (true, ""));
33+
/// let dfa = dense::DFA::new(re).unwrap();
34+
///
35+
/// let result = consume_prefix(&dfa, "this_is_a_python_name");
36+
/// assert_eq!(result, (true, Some(String::new())));
1737
///
18-
/// let result = consume_prefix(re, "this_is_a_python_name followed_by_other_stuff");
38+
/// let result = consume_prefix(&dfa, "this_is_a_python_name followed_by_other_stuff");
1939
/// assert_eq!(result, (true, Some(String::from(" followed_by_other_stuff"))));
2040
///
21-
/// let result = consume_prefix(re, "this_is't_a_python_name");
41+
/// let result = consume_prefix(&dfa, "this_is't_a_python_name");
2242
/// assert_eq!(result, (true, Some(String::from("'t_a_python_name"))));
2343
///
24-
/// let result = consume_prefix(re, "'tai'nt_a_python_name");
44+
/// let result = consume_prefix(&dfa, "'tai'nt_a_python_name");
2545
/// assert_eq!(result, (false, None));
2646
/// ```
27-
#[pyfunction]
28-
fn consume_prefix(re: &str, input: &str) -> (bool, Option<String>) {
47+
fn consume_prefix(dfa: &dyn Automaton, input: &str) -> (bool, Option<String>) {
2948
// Only match starting from the beginning of the string.
3049
let config = start::Config::new().anchored(Anchored::Yes);
31-
let dfa = dense::DFA::new(re).unwrap();
3250
let mut state = dfa.start_state(&config).unwrap();
3351
let mut remainder = None;
3452

@@ -56,9 +74,127 @@ fn consume_prefix(re: &str, input: &str) -> (bool, Option<String>) {
5674
return (true, Some(String::new()));
5775
}
5876

59-
#[pymodule]
60-
fn rust_syncode(_py: Python, m: &PyModule) -> PyResult<()> {
61-
m.add_function(wrap_pyfunction!(consume_prefix, m)?)?;
6277

63-
Ok(())
78+
/// Compute whether the string could match a sequence of terminals starting at a certain state in the first DFA.
79+
///
80+
/// Given a DFA D(Q, Σ, δ, q0, F ), a string w ∈ Σ∗, a DFA state q ∈ Q and any sequence of terminals Λ = {τf +1, τf +2 . . . τf +d}, dmatch(w, q, Λ) = true, if either of the following conditions hold:
81+
/// 1. δ∗(w, q) ∈ live(Q) or
82+
/// 2. ∃w1 ∈ Σ∗, w2 ∈ Σ+ such that w1.w2 = w, δ∗(w1, q) ∈ F and Λ = {} or
83+
/// 3. ∃w1 ∈ Σ∗, w2 ∈ Σ∗ such that w1.w2 = w, δ∗(w1, q) ∈ F, and dmatch(w2, qτf +10 , {τf +2 . . . τf +d}) = true where qτf +10 is the start state corresponding to the DFA for τf +1.
84+
///
85+
fn dmatch(string: &str, starting_state: DFAState, sequence_of_terminals: Vec<String>) -> bool {
86+
let dfa = dense::Builder::new().configure(dense::DFA::config().start_kind(dfa::StartKind::Anchored)).build(&starting_state.regex).unwrap();
87+
88+
// Case 1: the DFA, starting at this state, consumes the entire input and is still alive.
89+
let mut state = starting_state.state_id;
90+
for &b in string.as_bytes().iter() {
91+
state = dfa.next_state(state, b);
92+
}
93+
if !dfa.is_dead_state(state) {
94+
return true;
95+
}
96+
97+
// Case 2: The DFA consumes a prefix of the string, leaves a non-zero
98+
// suffix, and there is no sequence of terminals to follow.
99+
let mut state = starting_state.state_id;
100+
for (i, &b) in string.as_bytes().iter().enumerate() {
101+
state = dfa.next_state(state, b);
102+
if dfa.is_match_state(state) & sequence_of_terminals.is_empty() & (i < string.len()){
103+
return true;
104+
}
105+
}
106+
107+
// Case 3: A prefix of the string is successfully consumed by the DFA, and
108+
// dmatch is true starting at the next member of sequence_of_terminals.
109+
let mut state = starting_state.state_id;
110+
for (i, &b) in string.as_bytes().iter().enumerate() {
111+
state = dfa.next_state(state, b);
112+
if dfa.is_match_state(state) {
113+
let new_dfa = dense::Builder::new().configure(dense::DFA::config().start_kind(dfa::StartKind::Anchored)).build(&sequence_of_terminals[0]).unwrap();
114+
let new_starting_state = new_dfa.start_state(&start::Config::new().anchored(Anchored::Yes)).unwrap();
115+
let new_regex = sequence_of_terminals[0].clone();
116+
return dmatch(&string[i..], DFAState{regex: new_regex.into(), state_id: new_starting_state}, sequence_of_terminals[1..].to_vec());
117+
}
118+
}
119+
120+
// None of the previous cases succeeded, so dmatch is false.
121+
false
122+
}
123+
124+
125+
/// Return all states of a dfa by breadth-first search. There exists a private
126+
/// method that returns an iterator over all states. The suggested alternative
127+
/// is to traverse the graph manually. See
128+
/// https://github.com/rust-lang/regex/discussions/1223.
129+
fn states(dfa: &dense::DFA<Vec<u32>>) -> Vec<StateID> {
130+
let mut queue: VecDeque<StateID> = VecDeque::new();
131+
let mut explored: Vec<StateID> = Vec::new();
132+
133+
let start = dfa.start_state(&start::Config::new()).unwrap();
134+
135+
explored.push(start);
136+
queue.push_back(start);
137+
while !queue.is_empty() {
138+
let v = queue.pop_front().unwrap();
139+
if dfa.is_dead_state(v) {
140+
continue;
141+
}
142+
for letter in dfa.byte_classes().representatives(0..=255) {
143+
let next = dfa.next_state(start, letter.as_u8().unwrap());
144+
if !explored.contains(&next) {
145+
explored.push(next);
146+
}
147+
queue.push_back(next);
148+
}
149+
}
150+
explored
151+
}
152+
153+
154+
/// Compute the union of all states of a list of regexes.
155+
fn all_dfa_states(terminals: Vec<&str>) -> Vec<DFAState> {
156+
let mut res: Vec<DFAState> = Vec::new();
157+
for terminal in terminals {
158+
let dfa = dense::DFA::new(terminal).unwrap();
159+
let states = states(&dfa);
160+
for state in states {
161+
res.push(DFAState{regex: terminal.into(), state_id: state});
162+
}
163+
}
164+
res
165+
}
166+
167+
168+
// #[pymodule]
169+
// fn rust_syncode(_py: Python, m: &PyModule) -> PyResult<()> {
170+
// m.add_function(wrap_pyfunction!(consume_prefix, m)?)?;
171+
172+
// Ok(())
173+
// }
174+
175+
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `consume_prefix` against a Python-identifier
    /// regex: each entry pairs an input with the expected
    /// `(matched, remainder)` result.
    #[test]
    fn test_consume_prefix() {
        let dfa = dense::DFA::new(r"[a-zA-Z_]\w*").unwrap();

        let cases = [
            ("this_is_a_python_name", (true, Some(String::new()))),
            (
                "this_is_a_python_name followed_by_other_stuff",
                (true, Some(String::from(" followed_by_other_stuff"))),
            ),
            (
                "this_is't_a_python_name",
                (true, Some(String::from("'t_a_python_name"))),
            ),
            ("'tai'nt_a_python_name", (false, None)),
        ];

        for (input, expected) in cases {
            assert_eq!(consume_prefix(&dfa, input), expected);
        }
    }

    /// Placeholder: no assertions for `dmatch` yet.
    #[test]
    fn test_dmatch() {}
}

0 commit comments

Comments
 (0)