diff --git a/tamer/src/xir/parse/ele.rs b/tamer/src/xir/parse/ele.rs index c47a615b..73ae6ba3 100644 --- a/tamer/src/xir/parse/ele.rs +++ b/tamer/src/xir/parse/ele.rs @@ -50,13 +50,6 @@ pub struct EleParseCfg { pub repeat: bool, } -// This is an implementation detail for the internal state of EleParseState. -impl From for () { - fn from(_: EleParseCfg) -> Self { - () - } -} - /// Maximum level of nesting for source XML trees. /// /// Technically this is the maximum level of nesting for _parsing_ those @@ -252,15 +245,26 @@ macro_rules! ele_parse { $(type AttrValueError = $evty:ty;)? type Object = $objty:ty; - $($rest:tt)* + $( + [super] { + $($super_body:tt)* + }; + )? + + // Combination of square brackets above and the prefix here are + // needed for disambiguation. + $nt_first:ident := $($nt_defs:tt)* ) => { ele_parse! {@!next $vis $super $(type AttrValueError = $evty;)? type Object = $objty; - $($rest)* + $nt_first := $($nt_defs)* } - ele_parse!(@!super_sum <$objty> $vis $super $($rest)*); + ele_parse!(@!super_sum <$objty> $vis $super + $([super] { $($super_body)* })? + $nt_first := $($nt_defs)* + ); }; (@!next $vis:vis $super:ident @@ -423,11 +427,6 @@ macro_rules! ele_parse { // (defaulting to Incomplete via @!ele_expand_body). /$($close_span:ident)? => $closemap:expr, - // Non-whitespace text nodes can be mapped into elements with the - // given QName as a preprocessing step, - // allowing them to reuse the existing element NT system. - $([text]($text:ident, $text_span:ident) => $text_map:expr,)? - // Nonterminal references. <> { $( @@ -547,16 +546,60 @@ macro_rules! ele_parse { crate::xir::parse::NodeMatcher::from($qname) } - /// Yield the expected depth of child elements, - /// if known. - #[allow(dead_code)] // used by text special form - fn child_depth(&self) -> Option { + /// Whether the parser is in a state that can tolerate + /// superstate node preemption. 
+ /// + /// For more information, + /// see the superstate + #[doc=concat!( + " [`", stringify!($super), "::can_preempt_node`]." + )] + fn can_preempt_node(&self) -> bool { + use $nt::*; + match self { - $ntfirst((_, _, _, depth)) => Some(depth.child_depth()), + // Preemption before the opening tag is safe, + // since we haven't started processing yet. + Expecting_(..) => true, + + // Preemption during recovery would cause tokens to + // be parsed when they ought to be ignored, + // so we must process all tokens during recovery. + RecoverEleIgnore_(..) + | CloseRecoverIgnore_(..) => false, + + // It is _not_ safe to preempt attribute parsing + // since attribute parsers aggregate until a + // non-attribute token is encountered; + // we must allow attribute parsing to finish its + // job _before_ any preempted nodes are emitted + // since the attributes came _before_ that node. + Attrs_(..) => false, + + // These states represent jump states where we're + // about to transition to the next child parser. + // It's safe to preempt here, + // since we're not in the middle of parsing. + // + // Note that this includes `ExpectClose_` because of + // the macro preprocessing, + // and Rust's exhaustiveness check will ensure + // that it is accounted for if that changes. + // If we're expecting that the next token is a + // `Close`, + // then it must be safe to preempt other nodes + // that may appear in this context as children. + $ntfirst(..) => true, $( - $ntnext((_, _, _, depth)) => Some(depth.child_depth()), + $ntnext(..) => true, )* - _ => None, + + // Preemption after closing is similar to preemption + // in `Expecting_`, + // in that we're effectively in the parent + // context. + RecoverEleIgnoreClosed_(..) + | Closed_(..) => true, } } } @@ -731,10 +774,6 @@ macro_rules! ele_parse { }, }; - // Used only by _some_ expansions. 
- #[allow(unused_imports)] - use crate::xir::flat::Text; - use $nt::{ Attrs_, Expecting_, RecoverEleIgnore_, CloseRecoverIgnore_, RecoverEleIgnoreClosed_, @@ -916,6 +955,38 @@ macro_rules! ele_parse { Done_, } + impl $nt { + /// Whether the parser is in a state that can tolerate + /// superstate node preemption. + /// + /// For more information, + /// see the superstate + #[doc=concat!( + " [`", stringify!($super), "::can_preempt_node`]." + )] + fn can_preempt_node(&self) -> bool { + use $nt::*; + + match self { + // Preemption before the opening tag is safe, + // since we haven't started processing yet. + Expecting_(..) => true, + + // Preemption during recovery would cause tokens to + // be parsed when they ought to be ignored, + // so we must process all tokens during recovery. + RecoverEleIgnore_(..) => false, + + // Preemption after closing is similar to preemption + // in `Expecting_`, + // in that we're effectively in the parent + // context. + RecoverEleIgnoreClosed_(..) + | Done_ => true, + } + } + } + impl std::fmt::Display for $nt { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { use crate::{ @@ -1122,6 +1193,14 @@ macro_rules! ele_parse { // and we have to do so in a way that we can aggregate all of // those data. (@!super_sum <$objty:ty> $vis:vis $super:ident + $( + [super] { + // Non-whitespace text nodes can be mapped into elements + // with the given QName as a preprocessing step, + // allowing them to reuse the existing element NT system. + $([text]($text:ident, $text_span:ident) => $text_map:expr,)? + } + )? $( // NT definition is always followed by `:=`. $nt:ident := @@ -1247,7 +1326,31 @@ macro_rules! ele_parse { xir::flat::{XirfToken, RefinedText}, }; + // Used only by _some_ expansions. + #[allow(unused_imports)] + use crate::xir::flat::Text; + match (self, tok) { + // [super] { + $( + // [text] preemption; + // see `Self::can_preempt_node`. 
+ $( + ( + st, + XirfToken::Text( + RefinedText::Unrefined( + Text($text, $text_span) + ), + _, + ) + ) if st.can_preempt_node() => { + Transition(st).ok($text_map) + }, + )? + )? + // } + // Depth check is unnecessary since _all_ xir::parse // parsers // (at least at the time of writing) @@ -1258,7 +1361,6 @@ macro_rules! ele_parse { ( st, XirfToken::Text(RefinedText::Whitespace(..), _) - | XirfToken::Text(RefinedText::Unrefined(..), _) // XXX | XirfToken::Comment(..) ) => { Transition(st).incomplete() @@ -1317,6 +1419,40 @@ macro_rules! ele_parse { )* } } + + /// Whether the inner parser is in a state that can tolerate + /// superstate node preemption. + /// + /// Node preemption allows us (the superstate) to ask for + /// permission from the inner parser to parse some token + /// ourselves, + /// by asking whether the parser is in a state that + /// would cause semantic issues if we were to do so. + /// + /// For example, + /// if we were to preempt text nodes while an inner parser + /// was still parsing attributes, + /// then we would emit an object associated with that + /// text before the inner parser had a chance to + /// conclude that attribute parsing has completed and + /// emit the opening object for that node; + /// the result would otherwise be an incorrect + /// `Text, Open` instead of the correct `Open, Text`, + /// which would effectively unparent the text. + /// Similarly, + /// if we were to parse our own tokens while an inner + /// parser was performing error recovery in such a way as + /// to ignore all child tokens, + /// then we would emit an object in an incorrect + /// context. 
+ #[allow(dead_code)] // TODO: Remove when using for tpl apply + fn can_preempt_node(&self) -> bool { + match self { + $( + Self::$nt(st) => st.can_preempt_node(), + )* + } + } } } }; diff --git a/tamer/src/xir/parse/ele/test.rs b/tamer/src/xir/parse/ele/test.rs index 798515f4..820878ea 100644 --- a/tamer/src/xir/parse/ele/test.rs +++ b/tamer/src/xir/parse/ele/test.rs @@ -1554,14 +1554,8 @@ fn sum_repetition() { } // Text nodes may appear between elements if a `[text]` special form -// specifies a mapping. +// specifies a mapping on the superstate. // This is "mixed content" in XML. -// -// The `[text]` mapping applies to all text nodes at the child depth of the -// element, -// meaning it'll preempt sum parser delegation to provide the desired -// behavior. -#[ignore] // TODO: ignoring text nodes for now; fix! #[test] fn mixed_content_text_nodes() { #[derive(Debug, PartialEq, Eq)] @@ -1569,8 +1563,7 @@ fn mixed_content_text_nodes() { Root, A, B, - TextRoot(SymbolId, Span), - TextA(SymbolId, Span), + Text(SymbolId, Span), } impl crate::parse::Object for Foo {} @@ -1583,14 +1576,17 @@ fn mixed_content_text_nodes() { enum Sut; type Object = Foo; + [super] { + // The `[text]` special form here introduces a `Text` mapping + // for all non-whitespace text nodes. + [text](sym, span) => Foo::Text(sym, span), + }; + Root := QN_SUT { @ {} => Foo::Root, - // The `[text]` special form here introduces a `Text` mapping - // for all non-whitespace text node at the same depth as our - // child elements. - [text](sym, span) => Foo::TextRoot(sym, span), - + // Text allowed at any point between these elements because of + // the `[super]` definition. A, AB[*], }; @@ -1598,10 +1594,8 @@ fn mixed_content_text_nodes() { A := QN_A { @ {} => Foo::A, - // The child should not have its text processing preempted. - [text](sym, span) => Foo::TextA(sym, span), - - // Text should be permitted even though we permit no children. 
+ // Text should be permitted even though we permit no children, + // because of the `[super]` definition. }; // Used only for `AB`. @@ -1626,21 +1620,17 @@ fn mixed_content_text_nodes() { XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)), // Whitespace will not match the `[text]` special form. tok_ws.clone(), - // Text node for the root (Root). + // Text before root open. + // This must be emitted as a _child_ of Root, + // meaning that Root must be given the opportunity to report that + // attribute parsing is finished before we emit the object. XirfToken::Text(RefinedText::Unrefined(Text(text_root, S1)), Depth(1)), - // This child also expects text nodes, - // and should be able to yield its own parse. XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)), - // If this goes wrong, - // and Root does not properly check its depth, - // then the parser would erroneously yield `Foo::TextRoot` for - // this token. + // Text within a child. XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)), XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)), - // Now we're about to parse with `AB`, - // which itself cannot handle text. - // But text should never reach that parser, - // having been preempted by Root. + // Text _after_ a child node, + // which does not require ending attribute parsing before emitting. XirfToken::Text(RefinedText::Unrefined(Text(text_root, S3)), Depth(1)), // Try to yield A again with text. 
XirfToken::Open(QN_A, OpenSpan(S3, N), Depth(1)), @@ -1654,36 +1644,33 @@ fn mixed_content_text_nodes() { use Parsed::*; assert_eq!( Ok(vec![ - Incomplete, // [Root] Root Open - Incomplete, // [Root@] WS - Object(Foo::Root), // [Root@] Text (>LA) - Object(Foo::TextRoot(text_root, S1)), // [Root] Text (LA) - Object(Foo::TextA(text_a, S2)), // [A] Text (LA) - Object(Foo::TextA(text_a, S4)), // [A] Text (LA) + Object(Foo::Text(text_root, S1)), // [Root] Text (LA) + Object(Foo::Text(text_a, S2)), // [A] Text (LA) + Object(Foo::Text(text_a, S4)), // [A] Text ( Foo::Root, - // Mixed content permitted at root - // (but we won't be providing text for it in this test). - [text](sym, span) => Foo::TextRoot(sym, span), - - // But this child will not permit text. A, }; A := QN_A { @ {} => Foo::A, - - // Missing `[text`]; - // no mixed content permitted. }; } @@ -1719,9 +1699,7 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() { let toks = vec![ XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)), XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)), - // Even though the root permits text, - // the child `A` does not, - // and so this should result in an error. + // Text should not be permitted. XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)), XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)), XirfToken::Close(Some(QN_SUT), CloseSpan(S6, N), Depth(0)), @@ -1738,11 +1716,9 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() { assert_eq!(sut.next(), Some(Ok(Incomplete))); // [A] A Open (LA) - // The next token is `Text`, - // which is not expected by `A` and so should produce an error. - // The error that it produces at the time of writing is different than - // the error that it will eventually produce, - // so let's just expect some sort of error. + // The next token is text, + // which is not permitted because of a lack of `[super]` with + // `[text]`. assert_matches!(sut.next(), Some(Err(_))); // [A] Text (