diff --git a/Cargo.toml b/Cargo.toml index d868c00..9f50a3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "asciidocr" -version = "0.1.13" +version = "0.1.14" readme = "README.md" license = "MIT" edition = "2024" diff --git a/src/graph/inlines.rs b/src/graph/inlines.rs index d7ff8af..ce7e3d7 100644 --- a/src/graph/inlines.rs +++ b/src/graph/inlines.rs @@ -156,7 +156,7 @@ impl Inline { pub fn extract_child_inlines(&mut self) -> VecDeque { match &self { - Inline::InlineSpan(span) => span.inlines.clone().into(), + Inline::InlineSpan(span) => span.extract_span_inlines(), _ => todo!(), } } @@ -385,14 +385,22 @@ impl InlineSpan { pub fn add_inline(&mut self, inline: Inline) { // update the locations self.location = Location::reconcile(self.location.clone(), inline.locations()); - // combine literals if necessary - if matches!(inline, Inline::InlineLiteral(_)) { - if let Some(Inline::InlineLiteral(prior_literal)) = self.inlines.last_mut() { - prior_literal.add_text_from_inline_literal(inline); - return; + if let Some(Inline::InlineSpan(last_span)) = self.inlines.last_mut() { + if last_span.open { + last_span.add_inline(inline); + } else { + self.inlines.push(inline); } + } else { + // combine literals if necessary + if matches!(inline, Inline::InlineLiteral(_)) { + if let Some(Inline::InlineLiteral(prior_literal)) = self.inlines.last_mut() { + prior_literal.add_text_from_inline_literal(inline); + return; + } + } + self.inlines.push(inline); } - self.inlines.push(inline); } fn new_footnote_ref(footnote_ref: InlineRef) -> Self { @@ -406,6 +414,34 @@ impl InlineSpan { footnote } + // extracts the inlines inside the span, closing any open (dangling) spans that may be + // inside of it + fn extract_span_inlines(&self) -> VecDeque { + let mut children = VecDeque::new(); + for inline in self.inlines.iter() { + // handle any open spans + if inline.is_open() { + let mut working_inline = inline.clone(); + let open_span_literal = working_inline.produce_literal_from_self(); + let mut inline_children = working_inline.extract_child_inlines(); + if let Some(inline) = inline_children.front_mut() { + match inline { + Inline::InlineLiteral(literal) => { + literal.prepend_to_value(open_span_literal, literal.location.clone()); + } + _ => todo!(), + } + } else { + todo!() + } + children.extend(inline_children); + } else { + children.push_back(inline.clone()); + } + } + children + } + /// Deconstructs a footnote span into the relevant footnote definition ID (to be applied to /// the leafblock that contains the footnote text), an InlineSpan `Sup` that replaces the footnote /// with a link to said leafblock, and the vector of inlines that will be inserted into diff --git a/src/parser.rs b/src/parser.rs index 7917475..b4b4a51 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -832,6 +832,18 @@ impl Parser { last_inline.close(); self.in_inline_span = false; return Ok(()); + } else if let Inline::InlineSpan(last_span) = last_inline { + if let Some(last_internal_inline) = last_span.inlines.last_mut() { + if inline == *last_internal_inline { + last_internal_inline.reconcile_locations(inline.locations()); + last_internal_inline.close(); + } else { + last_span.add_inline(inline); + } + } else { + last_span.add_inline(inline); + } + return Ok(()); } } // handle newline tokens prior to constrained spans @@ -1291,26 +1303,28 @@ impl Parser { _ => todo!(), } // put any appended inlines into the stack at the relevant position - for child_inline in children { - self.inline_stack.insert(open_span_idx, child_inline) + while children.len() > 0 { + if let Some(child) = children.pop_back() { + self.inline_stack.insert(open_span_idx, child); + } } // consolidate any resultant or remaining adjacent literals (this should be extracted to a function) - let mut temp_stack: VecDeque = VecDeque::new(); - let mut inline_stack_iter = self.inline_stack.iter_mut().peekable(); - while inline_stack_iter.peek().is_some() { - if let Some(current) = inline_stack_iter.next() { - if let Inline::InlineLiteral(current_literal) = current { - if let Some(Inline::InlineLiteral(next_literal)) = - inline_stack_iter.next() - { - current_literal.combine_literals(next_literal.clone()); - } + let mut temp_stack: Vec = vec![]; + while let Some(mut inline) = self.inline_stack.pop_front() { + if temp_stack.len() == 0 { + temp_stack.push(inline); + } else if inline.is_literal() { + if let Some(Inline::InlineLiteral(last_in_stack)) = temp_stack.last_mut() { + last_in_stack.combine_literals(inline.extract_literal()); + } else { + temp_stack.push(inline); } - temp_stack.push_back(current.clone()); + } else { + temp_stack.push(inline); } } - self.inline_stack = temp_stack; + self.inline_stack = temp_stack.into(); } else { // ... or if there are no children, add the token to the back of the last one; this // is a little hacky, but it is cleaner compared to the rest of the code just to diff --git a/src/scanner/mod.rs b/src/scanner/mod.rs index bbd2100..c756673 100644 --- a/src/scanner/mod.rs +++ b/src/scanner/mod.rs @@ -715,21 +715,29 @@ impl<'a> Scanner<'a> { constrained: TokenType, unconstrained: TokenType, ) -> Result { + let inline_markup_chars = ['*', '_', '`', '+', '^', '~', '#']; + let mut end_of_inline_markers = vec![ + ' ', '\0', '.', ',', ';', ':', '\n', ')', '"', '!', '?', '\'', ']', '…', '“', '”', '‘', + '’', + ]; + let mut beginning_of_inline_markers = vec![' ', '\n', '\0', ']', '(', '"', '[']; + end_of_inline_markers.extend_from_slice(&inline_markup_chars); + beginning_of_inline_markers.extend_from_slice(&inline_markup_chars); // guard clause against dangling markup if self.peek() == ' ' && self.peek_back() == ' ' { self.add_text_until_next_markup() - } else if [ - ' ', '\0', '.', ',', ';', ':', '\n', ')', '"', '!', '?', '\'', ']', '…', '“', '”', '‘', - '’', - ] - .contains(&self.peek()) - || [' ', '\n', '\0', ']', '(', '"', '['].contains(&self.peek_back()) && self.peek() != c - { - self.add_token(constrained, false, 0) } else if self.peek() == c { - // we've got an unconstrained version + // the next character is the same + // we've got an unconstrained (i.e., "**foo**bar") version self.current += 1; self.add_token(unconstrained, false, 0) + } + // we're at the end of a span, or are butted up against another inline marker + else if end_of_inline_markers.contains(&self.peek()) || + // or we're at the beginning, or butted up against another inline marker + beginning_of_inline_markers.contains(&self.peek_back()) && self.peek() != c + { + self.add_token(constrained, false, 0) } else { self.add_text_until_next_markup() } @@ -1282,6 +1290,48 @@ mod tests { scan_and_assert_eq(&markup, expected_tokens); } + #[rstest] + #[case('*', TokenType::Strong)] + #[case('`', TokenType::Monospace)] + #[case('+', TokenType::Literal)] + #[case('^', TokenType::Superscript)] + #[case('~', TokenType::Subscript)] + #[case('#', TokenType::Mark)] + fn inline_formatting_by_other(#[case] markup_char: char, #[case] expected_token: TokenType) { + let markup = format!("Somx {}_bar_{} bar.", markup_char, markup_char); + let expected_tokens = vec![ + Token::new_default( + TokenType::Text, + "Somx ".to_string(), + Some("Somx ".to_string()), + 1, + 1, + 5, + ), + Token::new_default(expected_token, markup_char.to_string(), None, 1, 6, 6), + Token::new_default(TokenType::Emphasis, "_".to_string(), None, 1, 7, 7), + Token::new_default( + TokenType::Text, + "bar".to_string(), + Some("bar".to_string()), + 1, + 8, + 10, + ), + Token::new_default(TokenType::Emphasis, "_".to_string(), None, 1, 11, 11), + Token::new_default(expected_token, markup_char.to_string(), None, 1, 12, 12), + Token::new_default( + TokenType::Text, + " bar.".to_string(), + Some(" bar.".to_string()), + 1, + 13, + markup.len(), + ), + ]; + scan_and_assert_eq(&markup, expected_tokens); + } + #[rstest] #[case('*')] #[case('_')] diff --git a/src/scanner/tokens.rs b/src/scanner/tokens.rs index d8069bc..1057bc2 100644 --- a/src/scanner/tokens.rs +++ b/src/scanner/tokens.rs @@ -280,7 +280,7 @@ pub enum TokenType { // garden-variety text Hyperlink, // http://whatever.txt - Email, // cats@dogs.foo + Email, // cats@dogs.foo Text, // character reference, such as "—" diff --git a/tests/data/inlines/span-inside-spans-mixed.adoc b/tests/data/inlines/span-inside-spans-mixed.adoc new file mode 100644 index 0000000..cc9cfa1 --- /dev/null +++ b/tests/data/inlines/span-inside-spans-mixed.adoc @@ -0,0 +1 @@ +*_This_ shouldn't be a problem!* diff --git a/tests/data/inlines/span-inside-spans-mixed.json b/tests/data/inlines/span-inside-spans-mixed.json new file mode 100644 index 0000000..15ff2dc --- /dev/null +++ b/tests/data/inlines/span-inside-spans-mixed.json @@ -0,0 +1,113 @@ +{ + "name": "document", + "type": "block", + "blocks": [ + { + "name": "paragraph", + "type": "block", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "strong", + "form": "constrained", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "emphasis", + "form": "constrained", + "inlines": [ + { + "name": "text", + "type": "string", + "value": "This", + "location": [ + { + "line": 1, + "col": 3 + }, + { + "line": 1, + "col": 6 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 2 + }, + { + "line": 1, + "col": 7 + } + ] + }, + { + "name": "text", + "type": "string", + "value": " shouldn't be a problem!", + "location": [ + { + "line": 1, + "col": 8 + }, + { + "line": 1, + "col": 31 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 32 + } + ] + }, + { + "name": "text", + "type": "string", + "value": " ", + "location": [ + { + "line": 1, + "col": 33 + }, + { + "line": 1, + "col": 33 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 33 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 33 + } + ] +} \ No newline at end of file diff --git a/tests/data/inlines/span-inside-spans.adoc b/tests/data/inlines/span-inside-spans.adoc new file mode 100644 index 0000000..51e1ae0 --- /dev/null +++ b/tests/data/inlines/span-inside-spans.adoc @@ -0,0 +1,5 @@ +*_foo_* + +**_foo_** + +**__foo__** diff --git a/tests/data/inlines/span-inside-spans.json b/tests/data/inlines/span-inside-spans.json new file mode 100644 index 0000000..f6e5f50 --- /dev/null +++ b/tests/data/inlines/span-inside-spans.json @@ -0,0 +1,217 @@ +{ + "name": "document", + "type": "block", + "blocks": [ + { + "name": "paragraph", + "type": "block", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "strong", + "form": "constrained", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "emphasis", + "form": "constrained", + "inlines": [ + { + "name": "text", + "type": "string", + "value": "foo", + "location": [ + { + "line": 1, + "col": 3 + }, + { + "line": 1, + "col": 5 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 2 + }, + { + "line": 1, + "col": 6 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 7 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 7 + } + ] + }, + { + "name": "paragraph", + "type": "block", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "strong", + "form": "unconstrained", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "emphasis", + "form": "constrained", + "inlines": [ + { + "name": "text", + "type": "string", + "value": "foo", + "location": [ + { + "line": 3, + "col": 4 + }, + { + "line": 3, + "col": 6 + } + ] + } + ], + "location": [ + { + "line": 3, + "col": 3 + }, + { + "line": 3, + "col": 7 + } + ] + } + ], + "location": [ + { + "line": 3, + "col": 1 + }, + { + "line": 3, + "col": 9 + } + ] + } + ], + "location": [ + { + "line": 3, + "col": 1 + }, + { + "line": 3, + "col": 9 + } + ] + }, + { + "name": "paragraph", + "type": "block", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "strong", + "form": "unconstrained", + "inlines": [ + { + "name": "span", + "type": "inline", + "variant": "emphasis", + "form": "unconstrained", + "inlines": [ + { + "name": "text", + "type": "string", + "value": "foo", + "location": [ + { + "line": 5, + "col": 5 + }, + { + "line": 5, + "col": 7 + } + ] + } + ], + "location": [ + { + "line": 5, + "col": 3 + }, + { + "line": 5, + "col": 9 + } + ] + } + ], + "location": [ + { + "line": 5, + "col": 1 + }, + { + "line": 5, + "col": 11 + } + ] + } + ], + "location": [ + { + "line": 5, + "col": 1 + }, + { + "line": 5, + "col": 11 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 5, + "col": 11 + } + ] +} \ No newline at end of file diff --git a/tests/inline_tests.rs b/tests/inline_tests.rs index 0c4a3ba..aae12d6 100644 --- a/tests/inline_tests.rs +++ b/tests/inline_tests.rs @@ -36,6 +36,24 @@ fn test_spans_with_chars_between(#[case] markup_char: &str, #[case] variant: &st assert_parsed_doc_matches_expected_asg_from_str(&adoc_str, &asg_json_str) } +#[test] +fn test_spans_inside_spans() { + let adoc_str = fs::read_to_string("tests/data/inlines/span-inside-spans.adoc") + .expect("Unable to read asciidoc test template"); + let asg_json_str = fs::read_to_string("tests/data/inlines/span-inside-spans.json") + .expect("Unable to read asg json test template"); + assert_parsed_doc_matches_expected_asg_from_str(&adoc_str, &asg_json_str) +} + +#[test] +fn test_spans_inside_spans_mixed() { + let adoc_str = fs::read_to_string("tests/data/inlines/span-inside-spans-mixed.adoc") + .expect("Unable to read asciidoc test template"); + let asg_json_str = fs::read_to_string("tests/data/inlines/span-inside-spans-mixed.json") + .expect("Unable to read asg json test template"); + assert_parsed_doc_matches_expected_asg_from_str(&adoc_str, &asg_json_str) +} + #[rstest] #[case::emphasis("_")] #[case::strong("*")]