diff --git a/acdc-cli/CHANGELOG.md b/acdc-cli/CHANGELOG.md index 4e70649f..02183c4a 100644 --- a/acdc-cli/CHANGELOG.md +++ b/acdc-cli/CHANGELOG.md @@ -24,6 +24,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 source location via inline anchors. This goes beyond asciidoctor's HTML backend which leaves index sections empty. The index only renders when it's the last section in the document. +- **Semantic HTML5 backend** — `--backend html5s` produces semantic HTML5 output with proper + elements and ARIA roles instead of div-based layout. ([#329]) ### Fixed @@ -52,6 +54,7 @@ This is tagged but unreleased in crates.io for now. [#272]: https://github.com/nlopes/acdc/issues/272 [#273]: https://github.com/nlopes/acdc/issues/273 [#311]: https://github.com/nlopes/acdc/issues/311 +[#329]: https://github.com/nlopes/acdc/issues/329 [Unreleased]: https://github.com/nlopes/acdc/compare/acdc-cli-v0.1.0...HEAD [0.1.0]: https://github.com/nlopes/acdc/releases/tag/acdc-cli-v0.1.0 diff --git a/acdc-cli/src/subcommands/convert.rs b/acdc-cli/src/subcommands/convert.rs index 51263142..5eba4746 100644 --- a/acdc-cli/src/subcommands/convert.rs +++ b/acdc-cli/src/subcommands/convert.rs @@ -148,20 +148,18 @@ pub fn run(args: &Args) -> miette::Result<()> { .timings(args.timings) .embedded(args.embedded) .output_destination(output_destination) + .backend(args.backend) .build(); match args.backend { #[cfg(feature = "html")] - Backend::Html => { - // HTML can process files in parallel - each file writes to separate output - run_processor::( - args, - options, - document_attributes, - true, - ) - .map_err(|e| error::display(&e)) - } + Backend::Html | Backend::Html5s => run_processor::( + args, + options, + document_attributes, + true, + ) + .map_err(|e| error::display(&e)), #[cfg(feature = "terminal")] Backend::Terminal => { diff --git a/acdc-parser/CHANGELOG.md b/acdc-parser/CHANGELOG.md index 20ac4222..abf96cb0 100644 --- a/acdc-parser/CHANGELOG.md +++ b/acdc-parser/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to `acdc-parser` will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Fixed + +- **Bare autolinks no longer capture trailing punctuation** — URLs like + `https://example.com.` now correctly exclude the trailing `.` from the link target. + A new `bare_url()` rule with balanced parenthesis handling ensures sentence-level + punctuation (`.`, `,`, `;`, `!`, `?`, `:`) and surrounding parens are not consumed. +- **URL macro display text no longer produces nested autolinks** — display text in + `http://example.com[http://example.com]` is now parsed with autolinks suppressed, + preventing the inner URL from being double-linked. + ## [0.4.0] - 2026-02-07 ### Fixed diff --git a/acdc-parser/fixtures/tests/url_macro_url_display_text.adoc b/acdc-parser/fixtures/tests/url_macro_url_display_text.adoc new file mode 100644 index 00000000..182f6aea --- /dev/null +++ b/acdc-parser/fixtures/tests/url_macro_url_display_text.adoc @@ -0,0 +1 @@ +http://example.com/test1[http://example.com/test1] diff --git a/acdc-parser/fixtures/tests/url_macro_url_display_text.json b/acdc-parser/fixtures/tests/url_macro_url_display_text.json new file mode 100644 index 00000000..a443a72c --- /dev/null +++ b/acdc-parser/fixtures/tests/url_macro_url_display_text.json @@ -0,0 +1,52 @@ +{ + "name": "document", + "type": "block", + "blocks": [ + { + "name": "paragraph", + "type": "block", + "inlines": [ + { + "name": "ref", + "type": "inline", + "variant": "link", + "target": { + "type": "url", + "value": "http://example.com/test1" + }, + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 50 + } + ], + "attributes": {} + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 50 + } + ] + } + ], + "location": [ + { + "line": 1, + "col": 1 + }, + { + "line": 1, + "col": 50 + } + ] +} \ No newline at end of file diff --git a/acdc-parser/src/grammar/document.rs b/acdc-parser/src/grammar/document.rs index fbe8f7a9..d8b00aad 100644 --- a/acdc-parser/src/grammar/document.rs +++ b/acdc-parser/src/grammar/document.rs @@ -17,6 +17,7 @@ use crate::{ inline_preprocessor::InlinePreprocessorParserState, inline_processing::{ adjust_and_log_parse_error, parse_inlines, preprocess_inline_content, process_inlines, + process_inlines_no_autolinks, }, location_mapping::map_inline_locations, manpage::{ @@ -3555,9 +3556,12 @@ peg::parser! { } pub(crate) rule inlines(offset: usize, block_metadata: &BlockParsingMetadata) -> Vec - = (non_plain_text(offset, block_metadata) / plain_text(offset, block_metadata))+ + = (non_plain_text(offset, block_metadata, true) / plain_text(offset, block_metadata, true))+ - rule non_plain_text(offset: usize, block_metadata: &BlockParsingMetadata) -> InlineNode + pub(crate) rule inlines_no_autolinks(offset: usize, block_metadata: &BlockParsingMetadata) -> Vec + = (non_plain_text(offset, block_metadata, false) / plain_text(offset, block_metadata, false))+ + + rule non_plain_text(offset: usize, block_metadata: &BlockParsingMetadata, allow_autolinks: bool) -> InlineNode = inline:( // Escaped superscript/subscript must come first - produces RawText to prevent re-parsing escaped_super_sub:escaped_superscript_subscript(offset) { escaped_super_sub } @@ -3587,7 +3591,7 @@ peg::parser! { / url_macro:url_macro(offset, block_metadata) { url_macro } / pass:inline_pass(offset) { pass } / link_macro:link_macro(offset) { link_macro } - / inline_autolink:inline_autolink(offset) { inline_autolink } + / check_autolinks(allow_autolinks) inline_autolink:inline_autolink(offset) { inline_autolink } / inline_line_break:inline_line_break(offset) { inline_line_break } / bold_text_unconstrained:bold_text_unconstrained(offset, block_metadata) { bold_text_unconstrained } / bold_text_constrained:bold_text_constrained(offset, block_metadata) { bold_text_constrained } @@ -4000,7 +4004,7 @@ peg::parser! { } } let text = if let Some(text) = text { - process_inlines(state, block_metadata, &start, end, offset, &text) + process_inlines_no_autolinks(state, block_metadata, &start, end, offset, &text) .map_err(|e| { tracing::error!(?e, url_text = text, "could not process URL macro text"); "could not process URL macro text" @@ -4048,7 +4052,7 @@ peg::parser! { } } let text = if let Some(text) = text { - process_inlines(state, block_metadata, &start, end, offset, &text) + process_inlines_no_autolinks(state, block_metadata, &start, end, offset, &text) .map_err(|e| { tracing::error!(?e, url_text = text, "could not process mailto macro text"); "could not process mailto macro text" @@ -4065,12 +4069,16 @@ peg::parser! { }))) } + rule check_autolinks(allow: bool) -> () + = {? if allow { Ok(()) } else { Err("autolinks suppressed") } } + rule inline_autolink(offset: usize) -> InlineNode - = start:position!() + = + start:position!() url_info:( "<" url:url() ">" { (url, true) } / "<" url:email_address() ">" { (format!("mailto:{url}"), true) } - / url:url() { (url, false) } + / url:bare_url() { (url, false) } / url:email_address() { (format!("mailto:{url}"), false) } ) end:position!() @@ -4299,7 +4307,7 @@ peg::parser! { if trimmed.is_empty() { vec![] } else { - process_inlines(state, block_metadata, &start, end, offset, trimmed) + process_inlines_no_autolinks(state, block_metadata, &start, end, offset, trimmed) .map_err(|e| { tracing::error!(?e, xref_text = trimmed, "could not process xref text"); "could not process xref text" @@ -4331,7 +4339,7 @@ peg::parser! { let text = if raw_text.is_empty() { vec![] } else { - process_inlines(state, block_metadata, &start, end, offset, raw_text) + process_inlines_no_autolinks(state, block_metadata, &start, end, offset, raw_text) .map_err(|e| { tracing::error!(?e, xref_text = raw_text, "could not process xref text"); "could not process xref text" @@ -4860,14 +4868,14 @@ peg::parser! { })) } - rule plain_text(offset: usize, block_metadata: &BlockParsingMetadata) -> InlineNode + rule plain_text(offset: usize, block_metadata: &BlockParsingMetadata, allow_autolinks: bool) -> InlineNode = start_pos:position!() content:$(( // Escape sequences for superscript/subscript markers - only when NOT followed by // a complete pattern (those are handled by escaped_superscript_subscript rule) "\\" "^" !([^'^' | ' ' | '\t' | '\n']+ "^") / "\\" "~" !([^'~' | ' ' | '\t' | '\n']+ "~") - / (!(eol()*<2,> / ![_] / escaped_syntax_match() / index_term_match() / inline_anchor_match() / cross_reference_shorthand_match() / cross_reference_macro_match() / hard_wrap(offset) / footnote_match(offset, block_metadata) / inline_image(start_pos, block_metadata) / inline_icon(start_pos, block_metadata) / inline_stem(start_pos) / inline_keyboard(start_pos) / inline_button(start_pos) / inline_menu(start_pos) / mailto_macro(start_pos, block_metadata) / url_macro(start_pos, block_metadata) / inline_pass(start_pos) / link_macro(start_pos) / inline_autolink(start_pos) / inline_line_break(start_pos) / bold_text_unconstrained(start_pos, block_metadata) / bold_text_constrained_match() / italic_text_unconstrained(start_pos, block_metadata) / italic_text_constrained_match() / monospace_text_unconstrained(start_pos, block_metadata) / monospace_text_constrained_match() / highlight_text_unconstrained(start_pos, block_metadata) / highlight_text_constrained_match() / superscript_text(start_pos, block_metadata) / subscript_text(start_pos, block_metadata) / curved_quotation_text(start_pos, block_metadata) / curved_apostrophe_text(start_pos, block_metadata) / standalone_curved_apostrophe(start_pos, block_metadata)) [_]) + / (!(eol()*<2,> / ![_] / escaped_syntax_match() / index_term_match() / inline_anchor_match() / cross_reference_shorthand_match() / cross_reference_macro_match() / hard_wrap(offset) / footnote_match(offset, block_metadata) / inline_image(start_pos, block_metadata) / inline_icon(start_pos, block_metadata) / inline_stem(start_pos) / inline_keyboard(start_pos) / inline_button(start_pos) / inline_menu(start_pos) / mailto_macro(start_pos, block_metadata) / url_macro(start_pos, block_metadata) / inline_pass(start_pos) / link_macro(start_pos) / (check_autolinks(allow_autolinks) inline_autolink(start_pos)) / inline_line_break(start_pos) / bold_text_unconstrained(start_pos, block_metadata) / bold_text_constrained_match() / italic_text_unconstrained(start_pos, block_metadata) / italic_text_constrained_match() / monospace_text_unconstrained(start_pos, block_metadata) / monospace_text_constrained_match() / highlight_text_unconstrained(start_pos, block_metadata) / highlight_text_constrained_match() / superscript_text(start_pos, block_metadata) / subscript_text(start_pos, block_metadata) / curved_quotation_text(start_pos, block_metadata) / curved_apostrophe_text(start_pos, block_metadata) / standalone_curved_apostrophe(start_pos, block_metadata)) [_]) )+) end:position!() { @@ -5590,6 +5598,63 @@ peg::parser! { Ok(strip_url_backslash_escapes(&processed.text)) } + /// URL for bare autolinks — avoids capturing trailing sentence punctuation + /// (., ;, !, etc.) by only consuming punctuation when more URL chars follow. + rule bare_url() -> String = + proto:$("https" / "http" / "ftp" / "irc") "://" path:bare_url_path() + { format!("{proto}://{path}") } + + /// URL path for bare autolinks. Like url_path() but: + /// - Trailing punctuation (. , ; ! ? : ' *) only consumed when followed by more URL chars. + /// - `)` only consumed as part of a balanced `(...)` group, preventing capture of + /// sentence-level parens like `(see http://example.com)`. + rule bare_url_path() -> String = path:$( + bare_url_safe_char() + ( bare_url_safe_char() + / bare_url_paren_group() + / "(" + / bare_url_trailing_char() &bare_url_char() + )* + ) + {? + let inline_state = InlinePreprocessorParserState::new( + path, + state.line_map.clone(), + &state.input, + ); + let processed = inline_preprocessing::run(path, &state.document_attributes, &inline_state) + .map_err(|e| { + tracing::error!(?e, "could not preprocess bare url path"); + "could not preprocess bare url path" + })?; + for warning in inline_state.drain_warnings() { + state.add_warning(warning); + } + Ok(strip_url_backslash_escapes(&processed.text)) + } + + /// Balanced parenthesized group in a URL path. + /// Handles nested parens: `http://example.com/wiki/Foo_(bar_(baz))` + /// Only `)` consumed via this rule — unbalanced `)` is never captured. + rule bare_url_paren_group() + = "(" (bare_url_safe_char() / bare_url_trailing_char() / bare_url_paren_group() / "(")* ")" + + /// URL chars that are safe to end a bare URL — won't be confused with sentence punctuation. + /// Excludes `(` and `)` which are handled separately via `bare_url_paren_group`. + rule bare_url_safe_char() = ['A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '~' + | '/' | '#' | '@' | '$' | '&' + | '+' | '=' | '%' | '\\'] + + /// URL chars that are valid mid-URL but should not end a bare URL. + /// Excludes `)` which is only consumed via balanced `bare_url_paren_group`. + rule bare_url_trailing_char() = ['.' | ',' | ';' | '!' | '?' | ':' | '\'' | '*'] + + /// Any valid URL path char (for lookahead in trailing char rule). + /// Includes `(` because it can start a paren group. + /// Excludes `)` so that trailing chars before `)` aren't greedily consumed + /// (e.g., `http://example.com.)` keeps both `.` and `)` outside). + rule bare_url_char() = bare_url_safe_char() / bare_url_trailing_char() / "(" + /// Filesystem path - conservative character set for cross-platform compatibility /// Includes '{' and '}' for `AsciiDoc` attribute substitution pub rule path() -> String = path:$(['A'..='Z' | 'a'..='z' | '0'..='9' | '{' | '}' | '_' | '-' | '.' | '/' | '\\' ]+) diff --git a/acdc-parser/src/grammar/inline_processing.rs b/acdc-parser/src/grammar/inline_processing.rs index f6fbdec5..277de9d7 100644 --- a/acdc-parser/src/grammar/inline_processing.rs +++ b/acdc-parser/src/grammar/inline_processing.rs @@ -137,6 +137,38 @@ pub(crate) fn parse_inlines( Ok(inlines) } +#[tracing::instrument(skip_all, fields(processed=?processed, block_metadata=?block_metadata))] +pub(crate) fn parse_inlines_no_autolinks( + processed: &ProcessedContent, + state: &mut ParserState, + block_metadata: &BlockParsingMetadata, + location: &Location, +) -> Result, Error> { + let mut inline_peg_state = ParserState::new(&processed.text); + inline_peg_state.document_attributes = state.document_attributes.clone(); + inline_peg_state.footnote_tracker = state.footnote_tracker.clone(); + + let inlines = match document_parser::inlines_no_autolinks( + &processed.text, + &mut inline_peg_state, + 0, + block_metadata, + ) { + Ok(inlines) => inlines, + Err(err) => { + return Err(adjust_peg_error_position( + &err, + &processed.text, + location.absolute_start, + state, + )); + } + }; + + state.footnote_tracker = inline_peg_state.footnote_tracker.clone(); + Ok(inlines) +} + /// Process inlines /// /// This function processes inline content by first preprocessing it and then parsing it @@ -161,3 +193,25 @@ pub(crate) fn process_inlines( let content = parse_inlines(&processed, state, block_metadata, &location)?; super::location_mapping::map_inline_locations(state, &processed, &content, &location) } + +/// Process inlines with autolinks suppressed. +/// +/// Used inside URL macros, mailto macros, and cross-references where nested +/// autolinks would cause incorrect parsing. +#[tracing::instrument(skip_all, fields(?content_start, end, offset))] +pub(crate) fn process_inlines_no_autolinks( + state: &mut ParserState, + block_metadata: &BlockParsingMetadata, + content_start: &PositionWithOffset, + end: usize, + offset: usize, + content: &str, +) -> Result, Error> { + let (location, processed) = + preprocess_inline_content(state, content_start, end, offset, content)?; + if processed.text.trim().is_empty() { + return Ok(Vec::new()); + } + let content = parse_inlines_no_autolinks(&processed, state, block_metadata, &location)?; + super::location_mapping::map_inline_locations(state, &processed, &content, &location) +} diff --git a/converters/core/CHANGELOG.md b/converters/core/CHANGELOG.md index 798c7ba1..5b6d7c64 100644 --- a/converters/core/CHANGELOG.md +++ b/converters/core/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Comprehensive module-level documentation - `acdc-converters-dev` crate for test utilities (not published to crates.io) - Visitor method `visit_callout_ref` for processing callout references +- `Backend::Html5s` variant for semantic HTML5 output ### Fixed diff --git a/converters/core/src/backend.rs b/converters/core/src/backend.rs index 3a61be52..d1e603ca 100644 --- a/converters/core/src/backend.rs +++ b/converters/core/src/backend.rs @@ -12,6 +12,8 @@ pub enum Backend { /// HTML output format. #[default] Html, + /// Semantic HTML5 output format (html5s). + Html5s, /// Unix manpage (roff/troff) output format. Manpage, /// Terminal/console output with ANSI formatting. @@ -24,10 +26,11 @@ impl FromStr for Backend { fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "html" => Ok(Self::Html), + "html5s" => Ok(Self::Html5s), "manpage" => Ok(Self::Manpage), "terminal" => Ok(Self::Terminal), _ => Err(format!( - "invalid backend: '{s}', expected: html, manpage, terminal" + "invalid backend: '{s}', expected: html, html5s, manpage, terminal" )), } } @@ -37,6 +40,7 @@ impl std::fmt::Display for Backend { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Html => write!(f, "html"), + Self::Html5s => write!(f, "html5s"), Self::Manpage => write!(f, "manpage"), Self::Terminal => write!(f, "terminal"), } @@ -52,6 +56,8 @@ mod tests { fn test_from_str() { assert_eq!(Backend::from_str("html").unwrap(), Backend::Html); assert_eq!(Backend::from_str("HTML").unwrap(), Backend::Html); + assert_eq!(Backend::from_str("html5s").unwrap(), Backend::Html5s); + assert_eq!(Backend::from_str("HTML5S").unwrap(), Backend::Html5s); assert_eq!(Backend::from_str("manpage").unwrap(), Backend::Manpage); assert_eq!(Backend::from_str("terminal").unwrap(), Backend::Terminal); assert!(Backend::from_str("invalid").is_err()); @@ -60,6 +66,7 @@ mod tests { #[test] fn test_display() { assert_eq!(Backend::Html.to_string(), "html"); + assert_eq!(Backend::Html5s.to_string(), "html5s"); assert_eq!(Backend::Manpage.to_string(), "manpage"); assert_eq!(Backend::Terminal.to_string(), "terminal"); } diff --git a/converters/core/src/lib.rs b/converters/core/src/lib.rs index 784ef323..d57f3041 100644 --- a/converters/core/src/lib.rs +++ b/converters/core/src/lib.rs @@ -169,6 +169,7 @@ pub struct Options { embedded: bool, /// Output destination for conversion. output_destination: OutputDestination, + backend: Backend, } impl Options { @@ -218,6 +219,12 @@ impl Options { pub fn output_destination(&self) -> &OutputDestination { &self.output_destination } + + /// Get the backend type. + #[must_use] + pub fn backend(&self) -> Backend { + self.backend + } } /// Builder for [`Options`]. @@ -231,6 +238,7 @@ pub struct OptionsBuilder { timings: bool, embedded: bool, output_destination: OutputDestination, + backend: Backend, } impl OptionsBuilder { @@ -281,6 +289,13 @@ impl OptionsBuilder { self } + /// Set the backend type. + #[must_use] + pub fn backend(mut self, backend: Backend) -> Self { + self.backend = backend; + self + } + /// Build the [`Options`] instance. #[must_use] pub fn build(self) -> Options { @@ -291,6 +306,7 @@ impl OptionsBuilder { timings: self.timings, embedded: self.embedded, output_destination: self.output_destination, + backend: self.backend, } } } diff --git a/converters/html/CHANGELOG.md b/converters/html/CHANGELOG.md index c2a19ac7..d0ef895d 100644 --- a/converters/html/CHANGELOG.md +++ b/converters/html/CHANGELOG.md @@ -9,6 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Semantic HTML5 backend (`html5s`)** — new `--backend html5s` option produces semantic HTML5 + using `
`, `