tamer: xir::parse::ele: Superstate text node preemption

This introduces the concept of superstate node preemption generally, which I hope to use for template application as well, since templates can appear in essentially any (syntactically valid, for XML) position. This implements mixed content handling by defining the mapping on the superstate itself, which really simplifies the problem but foregoes fine-grained text handling. I had hoped to avoid that, but oh well. This pushes the responsibility of whether text is semantically valid at that position to NIR->AIR lowering (which we're not transitioning to yet), which is really the better place for it anyway, since this is just the grammar. The lowering to AIR will need to validate anyway given that template expansion happens after NIR. Moving on! DEV-7145
2022-08-16 10:42:06 -04:00 · 2022-08-16 10:42:06 -04:00 · 6f53c0971b
parent 65b42022f0
commit 6f53c0971b
2 changed files with 203 additions and 91 deletions
--- a/tamer/src/xir/parse/ele.rs
+++ b/tamer/src/xir/parse/ele.rs
@ -50,13 +50,6 @@ pub struct EleParseCfg {
    pub repeat: bool,
 }

-// This is an implementation detail for the internal state of EleParseState.
-impl From<EleParseCfg> for () {
-    fn from(_: EleParseCfg) -> Self {
-        ()
-    }
-}
-
 /// Maximum level of nesting for source XML trees.
 ///
 /// Technically this is the maximum level of nesting for _parsing_ those
@ -252,15 +245,26 @@ macro_rules! ele_parse {
        $(type AttrValueError = $evty:ty;)?
        type Object = $objty:ty;

-        $($rest:tt)*
+        $(
+            [super] {
+                $($super_body:tt)*
+            };
+        )?
+
+        // Combination of square brackets above and the prefix here are
+        //   needed for disambiguation.
+        $nt_first:ident := $($nt_defs:tt)*
    ) => {
        ele_parse! {@!next $vis $super
            $(type AttrValueError = $evty;)?
            type Object = $objty;
-            $($rest)*
+            $nt_first := $($nt_defs)*
        }

-        ele_parse!(@!super_sum <$objty> $vis $super $($rest)*);
+        ele_parse!(@!super_sum <$objty> $vis $super
+            $([super] { $($super_body)* })?
+            $nt_first := $($nt_defs)*
+        );
    };

    (@!next $vis:vis $super:ident
@ -423,11 +427,6 @@ macro_rules! ele_parse {
        //   (defaulting to Incomplete via @!ele_expand_body).
        /$($close_span:ident)? => $closemap:expr,

-        // Non-whitespace text nodes can be mapped into elements with the
-        //   given QName as a preprocessing step,
-        //     allowing them to reuse the existing element NT system.
-        $([text]($text:ident, $text_span:ident) => $text_map:expr,)?
-
        // Nonterminal references.
        <> {
            $(
@ -547,16 +546,60 @@ macro_rules! ele_parse {
                    crate::xir::parse::NodeMatcher::from($qname)
                }

-                /// Yield the expected depth of child elements,
-                ///   if known.
-                #[allow(dead_code)] // used by text special form
-                fn child_depth(&self) -> Option<crate::xir::flat::Depth> {
+                /// Whether the parser is in a state that can tolerate
+                ///   superstate node preemption.
+                ///
+                /// For more information,
+                ///   see the superstate
+                #[doc=concat!(
+                    " [`", stringify!($super), "::can_preempt_node`]."
+                )]
+                fn can_preempt_node(&self) -> bool {
+                    use $nt::*;
+
                    match self {
-                        $ntfirst((_, _, _, depth)) => Some(depth.child_depth()),
+                        // Preemption before the opening tag is safe,
+                        //   since we haven't started processing yet.
+                        Expecting_(..) => true,
+
+                        // Preemption during recovery would cause tokens to
+                        //   be parsed when they ought to be ignored,
+                        //     so we must process all tokens during recovery.
+                        RecoverEleIgnore_(..)
+                        | CloseRecoverIgnore_(..) => false,
+
+                        // It is _not_ safe to preempt attribute parsing
+                        //   since attribute parsers aggregate until a
+                        //   non-attribute token is encountered;
+                        //     we must allow attribute parsing to finish its
+                        //     job _before_ any preempted nodes are emitted
+                        //     since the attributes came _before_ that node.
+                        Attrs_(..) => false,
+
+                        // These states represent jump states where we're
+                        //   about to transition to the next child parser.
+                        // It's safe to preempt here,
+                        //   since we're not in the middle of parsing.
+                        //
+                        // Note that this includes `ExpectClose_` because of
+                        //   the macro preprocessing,
+                        //     and Rust's exhaustiveness check will ensure
+                        //     that it is accounted for if that changes.
+                        // If we're expecting that the next token is a
+                        //   `Close`,
+                        //     then it must be safe to preempt other nodes
+                        //     that may appear in this context as children.
+                        $ntfirst(..) => true,
                        $(
-                            $ntnext((_, _, _, depth)) => Some(depth.child_depth()),
+                            $ntnext(..) => true,
                        )*
-                        _ => None,
+
+                        // Preemption after closing is similar to preemption
+                        //   in `Expecting_`,
+                        //     in that we're effectively in the parent
+                        //     context.
+                        RecoverEleIgnoreClosed_(..)
+                        | Closed_(..) => true,
                    }
                }
            }
@ -731,10 +774,6 @@ macro_rules! ele_parse {
                        },
                    };

-                    // Used only by _some_ expansions.
-                    #[allow(unused_imports)]
-                    use crate::xir::flat::Text;
-
                    use $nt::{
                        Attrs_, Expecting_, RecoverEleIgnore_,
                        CloseRecoverIgnore_, RecoverEleIgnoreClosed_,
@ -916,6 +955,38 @@ macro_rules! ele_parse {
                Done_,
            }

+            impl $nt {
+                /// Whether the parser is in a state that can tolerate
+                ///   superstate node preemption.
+                ///
+                /// For more information,
+                ///   see the superstate
+                #[doc=concat!(
+                    " [`", stringify!($super), "::can_preempt_node`]."
+                )]
+                fn can_preempt_node(&self) -> bool {
+                    use $nt::*;
+
+                    match self {
+                        // Preemption before the opening tag is safe,
+                        //   since we haven't started processing yet.
+                        Expecting_(..) => true,
+
+                        // Preemption during recovery would cause tokens to
+                        //   be parsed when they ought to be ignored,
+                        //     so we must process all tokens during recovery.
+                        RecoverEleIgnore_(..) => false,
+
+                        // Preemption after closing is similar to preemption
+                        //   in `Expecting_`,
+                        //     in that we're effectively in the parent
+                        //     context.
+                        RecoverEleIgnoreClosed_(..)
+                        | Done_ => true,
+                    }
+                }
+            }
+
            impl std::fmt::Display for $nt {
                fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
                    use crate::{
@ -1122,6 +1193,14 @@ macro_rules! ele_parse {
    //       and we have to do so in a way that we can aggregate all of
    //       those data.
    (@!super_sum <$objty:ty> $vis:vis $super:ident
+        $(
+            [super] {
+                // Non-whitespace text nodes can be mapped into elements
+                //   with the given QName as a preprocessing step,
+                //     allowing them to reuse the existing element NT system.
+                $([text]($text:ident, $text_span:ident) => $text_map:expr,)?
+            }
+        )?
        $(
            // NT definition is always followed by `:=`.
            $nt:ident :=
@ -1247,7 +1326,31 @@ macro_rules! ele_parse {
                        xir::flat::{XirfToken, RefinedText},
                    };

+                    // Used only by _some_ expansions.
+                    #[allow(unused_imports)]
+                    use crate::xir::flat::Text;
+
                    match (self, tok) {
+                        // [super] {
+                        $(
+                            // [text] preemption;
+                            //   see `Self::can_preempt_node`.
+                            $(
+                                (
+                                    st,
+                                    XirfToken::Text(
+                                        RefinedText::Unrefined(
+                                            Text($text, $text_span)
+                                        ),
+                                        _,
+                                    )
+                                ) if st.can_preempt_node() => {
+                                    Transition(st).ok($text_map)
+                                },
+                            )?
+                        )?
+                        // }
+
                        // Depth check is unnecessary since _all_ xir::parse
                        //   parsers
                        //     (at least at the time of writing)
@ -1258,7 +1361,6 @@ macro_rules! ele_parse {
                        (
                            st,
                            XirfToken::Text(RefinedText::Whitespace(..), _)
-                            | XirfToken::Text(RefinedText::Unrefined(..), _) // XXX
                            | XirfToken::Comment(..)
                        ) => {
                            Transition(st).incomplete()
@ -1317,6 +1419,40 @@ macro_rules! ele_parse {
                        )*
                    }
                }
+
+                /// Whether the inner parser is in a state that can tolerate
+                ///   superstate node preemption.
+                ///
+                /// Node preemption allows us (the superstate) to ask for
+                ///   permission from the inner parser to parse some token
+                ///   ourselves,
+                ///     by asking whether the parser is in a state that
+                ///     would cause semantic issues if we were to do so.
+                ///
+                /// For example,
+                ///   if we were to preempt text nodes while an inner parser
+                ///   was still parsing attributes,
+                ///     then we would emit an object associated with that
+                ///     text before the inner parser had a chance to
+                ///     conclude that attribute parsing has completed and
+                ///     emit the opening object for that node;
+                ///       the result would otherwise be an incorrect
+                ///       `Text, Open` instead of the correct `Open, Text`,
+                ///         which would effectively unparent the text.
+                /// Similarly,
+                ///   if we were to parse our own tokens while an inner
+                ///   parser was performing error recovery in such a way as
+                ///   to ignore all child tokens,
+                ///     then we would emit an object in an incorrect
+                ///     context.
+                #[allow(dead_code)] // TODO: Remove when using for tpl apply
+                fn can_preempt_node(&self) -> bool {
+                    match self {
+                        $(
+                            Self::$nt(st) => st.can_preempt_node(),
+                        )*
+                    }
+                }
            }
        }
    };
--- a/tamer/src/xir/parse/ele/test.rs
+++ b/tamer/src/xir/parse/ele/test.rs
@ -1554,14 +1554,8 @@ fn sum_repetition() {
 }

 // Text nodes may appear between elements if a `[text]` special form
-//   specifies a mapping.
+//   specifies a mapping on the superstate.
 // This is "mixed content" in XML.
-//
-// The `[text]` mapping applies to all text nodes at the child depth of the
-//   element,
-//     meaning it'll preempt sum parser delegation to provide the desired
-//     behavior.
-#[ignore] // TODO: ignoring text nodes for now; fix!
 #[test]
 fn mixed_content_text_nodes() {
    #[derive(Debug, PartialEq, Eq)]
@ -1569,8 +1563,7 @@ fn mixed_content_text_nodes() {
        Root,
        A,
        B,
-        TextRoot(SymbolId, Span),
-        TextA(SymbolId, Span),
+        Text(SymbolId, Span),
    }

    impl crate::parse::Object for Foo {}
@ -1583,14 +1576,17 @@ fn mixed_content_text_nodes() {
        enum Sut;
        type Object = Foo;

+        [super] {
+            // The `[text]` special form here introduces a `Text` mapping
+            //   for all non-whitespace text nodes.
+            [text](sym, span) => Foo::Text(sym, span),
+        };
+
        Root := QN_SUT {
            @ {} => Foo::Root,

-            // The `[text]` special form here introduces a `Text` mapping
-            //   for all non-whitespace text node at the same depth as our
-            //   child elements.
-            [text](sym, span) => Foo::TextRoot(sym, span),
-
+            // Text allowed at any point between these elements because of
+            //   the `[super]` definition.
            A,
            AB[*],
        };
@ -1598,10 +1594,8 @@ fn mixed_content_text_nodes() {
        A := QN_A {
            @ {} => Foo::A,

-            // The child should not have its text processing preempted.
-            [text](sym, span) => Foo::TextA(sym, span),
-
-            // Text should be permitted even though we permit no children.
+            // Text should be permitted even though we permit no children,
+            //   because of the `[super]` definition.
        };

        // Used only for `AB`.
@ -1626,21 +1620,17 @@ fn mixed_content_text_nodes() {
        XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)),
        // Whitespace will not match the `[text]` special form.
        tok_ws.clone(),
-        // Text node for the root (Root).
+        // Text after root open and before any child element.
+        // This must be emitted as a _child_ of Root,
+        //   meaning that Root must be given the opportunity to report that
+        //   attribute parsing is finished before we emit the object.
        XirfToken::Text(RefinedText::Unrefined(Text(text_root, S1)), Depth(1)),
-        // This child also expects text nodes,
-        //   and should be able to yield its own parse.
        XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)),
-        // If this goes wrong,
-        //   and Root does not properly check its depth,
-        //   then the parser would erroneously yield `Foo::TextRoot` for
-        //     this token.
+        // Text within a child.
        XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)),
        XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)),
-        // Now we're about to parse with `AB`,
-        //   which itself cannot handle text.
-        // But text should never reach that parser,
-        //   having been preempted by Root.
+        // Text _after_ a child node,
+        //   which does not require ending attribute parsing before emitting.
        XirfToken::Text(RefinedText::Unrefined(Text(text_root, S3)), Depth(1)),
        // Try to yield A again with text.
        XirfToken::Open(QN_A, OpenSpan(S3, N), Depth(1)),
@ -1654,36 +1644,33 @@ fn mixed_content_text_nodes() {
    use Parsed::*;
    assert_eq!(
        Ok(vec![
-            Incomplete,                           // [Root]  Root Open
-            Incomplete,                           // [Root@] WS
-            Object(Foo::Root),                    // [Root@] Text (>LA)
-            Object(Foo::TextRoot(text_root, S1)), // [Root]  Text (<LA)
-            Incomplete,                           // [A]     A Open (<LA)
-            Object(Foo::A),                       // [A@]    A Text (>LA)
-            Object(Foo::TextA(text_a, S2)),       // [A]     Text (<LA)
-            Incomplete,                           // [A]     A Close
-            Object(Foo::TextRoot(text_root, S3)), // [Root]   Text
-            Incomplete,                           // [A]     A Open
-            Object(Foo::A),                       // [A@]    A Text (>LA)
-            Object(Foo::TextA(text_a, S4)),       // [A]     Text (<LA)
-            Incomplete,                           // [A]     A Close
-            Object(Foo::TextRoot(text_root, S5)), // [Root]  Text
-            Incomplete,                           // [Root]  Root Close
+            Incomplete,                       // [Root]  Root Open
+            Incomplete,                       // [Root@] WS
+            Object(Foo::Root),                // [Root@] Text (>LA)
+            Object(Foo::Text(text_root, S1)), // [Root]  Text (<LA)
+            Incomplete,                       // [A]     A Open (<LA)
+            Object(Foo::A),                   // [A@]    A Text (>LA)
+            Object(Foo::Text(text_a, S2)),    // [A]     Text (<LA)
+            Incomplete,                       // [A]     A Close
+            Object(Foo::Text(text_root, S3)), // [Root]  Text
+            Incomplete,                       // [A]     A Open
+            Object(Foo::A),                   // [A@]    A Text (>LA)
+            Object(Foo::Text(text_a, S4)),    // [A]     Text (<LA)
+            Incomplete,                       // [A]     A Close
+            Object(Foo::Text(text_root, S5)), // [Root]  Text
+            Incomplete,                       // [Root]  Root Close
        ]),
        Sut::parse(toks.into_iter()).collect(),
    );
 }

 /// Contrast this test with [`mixed_content_text_nodes`] above.
-#[ignore] // TODO: ignoring text nodes for now; fix!
-#[allow(dead_code)]
 #[test]
-fn mixed_content_text_nodes_with_non_mixed_content_child() {
+fn no_mixed_content_super() {
    #[derive(Debug, PartialEq, Eq)]
    enum Foo {
        Root,
        A,
-        TextRoot(SymbolId, Span),
    }

    impl crate::parse::Object for Foo {}
@ -1691,6 +1678,7 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() {
    const QN_SUT: QName = QN_PACKAGE;
    const QN_A: QName = QN_CLASSIFY;

+    // No text permitted.
    ele_parse! {
        enum Sut;
        type Object = Foo;
@ -1698,19 +1686,11 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() {
        Root := QN_SUT {
            @ {} => Foo::Root,

-            // Mixed content permitted at root
-            //   (but we won't be providing text for it in this test).
-            [text](sym, span) => Foo::TextRoot(sym, span),
-
-            // But this child will not permit text.
            A,
        };

        A := QN_A {
            @ {} => Foo::A,
-
-            // Missing `[text`];
-            //   no mixed content permitted.
        };
    }

@ -1719,9 +1699,7 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() {
    let toks = vec![
        XirfToken::Open(QN_SUT, OpenSpan(S1, N), Depth(0)),
        XirfToken::Open(QN_A, OpenSpan(S2, N), Depth(1)),
-        // Even though the root permits text,
-        //   the child `A` does not,
-        //   and so this should result in an error.
+        // Text should not be permitted.
        XirfToken::Text(RefinedText::Unrefined(Text(text_a, S2)), Depth(2)),
        XirfToken::Close(None, CloseSpan::empty(S3), Depth(1)),
        XirfToken::Close(Some(QN_SUT), CloseSpan(S6, N), Depth(0)),
@ -1738,11 +1716,9 @@ fn mixed_content_text_nodes_with_non_mixed_content_child() {
    assert_eq!(sut.next(), Some(Ok(Incomplete))); // [A] A Open (<LA)
    assert_eq!(sut.next(), Some(Ok(Object(Foo::A)))); // [A@] Text (>LA)

-    // The next token is `Text`,
-    //   which is not expected by `A` and so should produce an error.
-    // The error that it produces at the time of writing is different than
-    //   the error that it will eventually produce,
-    //     so let's just expect some sort of error.
+    // The next token is text,
+    //   which is not permitted because of a lack of `[super]` with
+    //   `[text]`.
    assert_matches!(sut.next(), Some(Err(_))); // [A] Text (<LA)

    // A then enters recovery,