tamer: xir::parse::ele: Introduce sum nonterminals

This introduces `Nt := (A | ... | Z);`, where `Nt` is the name of the nonterminal and `A ... Z` are the inner nonterminals---it produces a parser that provides a choice between a set of nonterminals. This is implemented efficiently by understanding the QName that is accepted by each of the inner nonterminals and delegating that token immediately to the appropriate parser. This is a benefit of using a parser generator macro over parser combinators---we do not need to implement backtracking by letting inner parsers fail, because we know ahead of time exactly what parser we need. This _does not_ verify that each of the inner parsers accept a unique QName; maybe at a later time I can figure out something for that. However, because this compiles into a `match`, there is no ambiguity---like a PEG parser, there is precedence in the face of an ambiguous token, and the first one wins. Consequently, tests would surely fail, since the latter wouldn't be able to be parsed. This also demonstrates how we can have good error suggestions for this parsing framework: because the inner nonterminals and their QNames are known at compile time, error messages simply generate a list of QNames that are expected. The error recovery strategy is the same as previously noted, and subject to the same concerns, though it may be more appropriate here: it is desirable for the inner parser to fail rather than retrying, so that the sum parser is able to fail and, once the Kleene operator is introduced, retry on another potential element. But again, that recovery strategy may happen to work in some cases, but'll fail miserably in others (e.g. placing an unknown element at the head of a block that expects a sequence of elements would potentially fail the entire block rather than just the invalid one). But more to come on that later; it's not critical at this point. I need to get parsing completed for TAME's input language. DEV-7145
2022-07-14 15:12:57 -04:00 · 2022-07-14 15:12:57 -04:00 · cf2cd882ca
parent 1fdfc0aa4d
commit cf2cd882ca
4 changed files with 369 additions and 10 deletions
--- a/tamer/src/xir/flat.rs
+++ b/tamer/src/xir/flat.rs
@ -57,7 +57,7 @@ use std::{error::Error, fmt::Display};

 /// Tag nesting depth
 ///   (`0` represents the root).
-#[derive(Debug, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Depth(pub usize);

 impl Display for Depth {
--- a/tamer/src/xir/fmt.rs
+++ b/tamer/src/xir/fmt.rs
@ -19,7 +19,7 @@

 //! XIR formatting types for use with [`crate::fmt`]

-use crate::fmt::{AndQualConjList, Delim, Prefix, Raw, Tt};
+use crate::fmt::{AndQualConjList, Delim, OrQualConjList, Prefix, Raw, Tt};

 /// Denote an XML attribute by prefixing the value with `@`.
 pub type XmlAttr = Prefix<"@", Raw>;
@ -33,3 +33,7 @@ pub type OpenXmlEle = Delim<"<", ">", Raw>;
 /// Opening tag for XML element as teletypewriter
 ///   (for use in sentences).
 pub type TtOpenXmlEle = Tt<OpenXmlEle>;
+
+/// A choice of a list of opening XML tags.
+pub type OpenEleSumList =
+    OrQualConjList<"element", "one of elements", TtOpenXmlEle>;
--- a/tamer/src/xir/parse/ele.rs
+++ b/tamer/src/xir/parse/ele.rs
@ -19,6 +19,11 @@

 //! Element parser generator for parsing of [XIRF](super::super::flat).

+use crate::parse::ParseState;
+
+/// A parser accepting a single element.
+pub trait EleParseState: ParseState {}
+
 #[macro_export]
 macro_rules! ele_parse {
    (type Object = $objty:ty; $($rest:tt)*) => {
@ -41,10 +46,13 @@ macro_rules! ele_parse {
        }
    };

-    (@!nonterm_def <$objty:ty> $nt:ident ($ntreffirst:ident $(| $ntref:ident)+), $($rest:tt)*) => {
-        ele_parse!(@!ele_dfn_sum $nt [$ntfirst $($nt)*]);
+    (@!nonterm_def <$objty:ty> $nt:ident
+        ($ntref_first:ident $(| $ntref:ident)+); $($rest:tt)*
+    ) => {
+        ele_parse!(@!ele_dfn_sum <$objty> $nt [$ntref_first $($ntref)*]);

        ele_parse! {
+            type Object = $objty;
            $($rest)*
        }
    };
@ -124,7 +132,6 @@ macro_rules! ele_parse {
            )*
        }
    ) => {
-        // TODO
        paste::paste! {
            crate::attr_parse! {
                struct [<$nt AttrsState_>] -> [<$nt Attrs_>] {
@ -148,6 +155,8 @@ macro_rules! ele_parse {
                /// Recovery state ignoring all remaining tokens for this
                ///   element.
                RecoverEleIgnore_(crate::xir::QName, crate::xir::OpenSpan, Depth),
+                // Recovery completed because end tag corresponding to the
+                //   invalid element has been found.
                RecoverEleIgnoreClosed_(crate::xir::QName, crate::xir::CloseSpan),
                /// Parsing element attributes.
                Attrs_([<$nt AttrsState_>]),
@ -160,6 +169,17 @@ macro_rules! ele_parse {
                Closed_(crate::span::Span),
            }

+            impl crate::xir::parse::ele::EleParseState for $nt {}
+
+            impl $nt {
+                /// [`QName`](crate::xir::QName) of the element recognized
+                ///   by this parser.
+                #[allow(dead_code)] // used by sum parser
+                const fn qname() -> crate::xir::QName {
+                    $qname
+                }
+            }
+
            impl std::fmt::Display for $nt {
                fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
                    use crate::{
@ -366,12 +386,199 @@ macro_rules! ele_parse {
        }
    };

-    (@!ele_dfn_sum $nt:ident [$($ntref:ident)*]) => {
-        #[derive(Debug, PartialEq, Eq)]
-        enum $nt {
+    (@!ele_dfn_sum <$objty:ty> $nt:ident [$($ntref:ident)*]) => {
+        $(
+            // Provide a (hopefully) helpful error that can be corrected
+            //   rather than any obscure errors that may follow from trying
+            //   to compose parsers that were not generated with this macro.
+            assert_impl_all!($ntref: crate::xir::parse::ele::EleParseState);
+        )*
+
+        paste::paste! {
+            #[doc=concat!(
+                "Parser expecting one of ",
+                $("[`", stringify!($ntref), "`], ",)*
+                "."
+            )]
+            #[derive(Debug, PartialEq, Eq, Default)]
+            enum $nt {
+                #[default]
+                Expecting_,
+                /// Recovery state ignoring all remaining tokens for this
+                ///   element.
+                RecoverEleIgnore_(crate::xir::QName, crate::xir::OpenSpan, Depth),
+                RecoverEleIgnoreClosed_(crate::xir::QName, crate::xir::CloseSpan),
+                $(
+                    $ntref($ntref),
+                )*
+            }
+
+            impl std::fmt::Display for $nt {
+                fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                    use crate::{
+                        fmt::{DisplayWrapper, ListDisplayWrapper, TtQuote},
+                        xir::fmt::OpenEleSumList,
+                    };
+
+                    let ntrefs = [
+                        $(
+                            $ntref::qname(),
+                        )*
+                    ];
+                    let expected = OpenEleSumList::wrap(&ntrefs);
+
+                    match self {
+                        Self::Expecting_ => {
+                            write!(f, "expecting {expected}")
+                        },
+
+                        Self::RecoverEleIgnore_(name, ..)
+                        | Self::RecoverEleIgnoreClosed_(name, ..) => write!(
+                            f,
+                            "attempting to recover by ignoring element \
+                               with unexpected name {given} \
+                               (expected {expected})",
+                            given = TtQuote::wrap(name),
+                        ),
+
+                        $(
+                            Self::$ntref(st) => std::fmt::Display::fmt(st, f),
+                        )*
+                    }
+                }
+            }
+
+            #[derive(Debug, PartialEq)]
+            enum [<$nt Error_>] {
+                UnexpectedEle_(crate::xir::QName, crate::span::Span),
+                $(
+                    $ntref([<$ntref Error_>]),
+                )*
+            }
+
            $(
-                $ntref($ntref),
+                impl From<[<$ntref Error_>]> for [<$nt Error_>] {
+                    fn from(e: [<$ntref Error_>]) -> Self {
+                        [<$nt Error_>]::$ntref(e)
+                    }
+                }
            )*
+
+            impl std::error::Error for [<$nt Error_>] {
+                fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+                    // TODO
+                    None
+                }
+            }
+
+            impl std::fmt::Display for [<$nt Error_>] {
+                fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                    use crate::{
+                        fmt::DisplayWrapper,
+                        xir::fmt::TtOpenXmlEle,
+                    };
+
+                    match self {
+                        Self::UnexpectedEle_(qname, _) => {
+                            write!(f, "unexpected {}", TtOpenXmlEle::wrap(qname))
+                        },
+                        $(
+                            Self::$ntref(e) => std::fmt::Display::fmt(e, f),
+                        )*
+                    }
+                }
+            }
+
+            impl crate::diagnose::Diagnostic for [<$nt Error_>] {
+                fn describe(&self) -> Vec<crate::diagnose::AnnotatedSpan> {
+                    todo!()
+                }
+            }
+
+            impl crate::parse::ParseState for $nt {
+                type Token = crate::xir::flat::XirfToken;
+                type Object = $objty;
+                type Error = [<$nt Error_>];
+
+                fn parse_token(
+                    self,
+                    tok: Self::Token,
+                    _: crate::parse::NoContext,
+                ) -> crate::parse::TransitionResult<Self> {
+                    use crate::{
+                        parse::{EmptyContext, Transition},
+                        xir::flat::XirfToken,
+                    };
+
+                    use $nt::{Expecting_, RecoverEleIgnore_, RecoverEleIgnoreClosed_};
+
+                    match (self, tok) {
+                        $(
+                            (
+                                Expecting_,
+                                XirfToken::Open(qname, span, depth)
+                            ) if qname == $ntref::qname() => {
+                                $ntref::default().delegate(
+                                    XirfToken::Open(qname, span, depth),
+                                    EmptyContext,
+                                    |si| Transition(Self::$ntref(si)),
+                                    || todo!("inner dead (should not happen here)"),
+                                )
+                            },
+                        )*
+
+                        (Expecting_, XirfToken::Open(qname, span, depth)) => {
+                            Transition(RecoverEleIgnore_(qname, span, depth)).err(
+                                // Use name span rather than full `OpenSpan`
+                                //   since it's specifically the name that
+                                //   was unexpected,
+                                //     not the fact that it's an element.
+                                [<$nt Error_>]::UnexpectedEle_(qname, span.name_span())
+                            )
+                        },
+
+                        // XIRF ensures that the closing tag matches the opening,
+                        //   so we need only check depth.
+                        (
+                            RecoverEleIgnore_(qname, _, depth_open),
+                            XirfToken::Close(_, span, depth_close)
+                        ) if depth_open == depth_close => {
+                            Transition(RecoverEleIgnoreClosed_(qname, span)).incomplete()
+                        },
+
+                        (st @ RecoverEleIgnore_(..), _) => {
+                            Transition(st).incomplete()
+                        },
+
+                        $(
+                            (Self::$ntref(si), tok) => si.delegate(
+                                tok,
+                                EmptyContext,
+                                |si| Transition(Self::$ntref(si)),
+                                || todo!("inner dead"),
+                            ),
+                        )*
+
+                        todo => todo!("sum {todo:?}"),
+                    }
+                }
+
+                fn is_accepting(&self) -> bool {
+                    match self {
+                        Self::RecoverEleIgnoreClosed_(..) => true,
+
+                        // Delegate entirely to the inner ParseState.
+                        // It is desirable to maintain this state even after
+                        //   the inner parser is completed so that the inner
+                        //   state can accurately describe what took place.
+                        $(
+                            Self::$ntref(si) => si.is_accepting(),
+                        )*
+
+                        _ => false,
+                    }
+                }
+            }
        }
    };
 }
--- a/tamer/src/xir/parse/ele/test.rs
+++ b/tamer/src/xir/parse/ele/test.rs
@ -34,7 +34,7 @@ use crate::{
        attr::{Attr, AttrSpan},
        flat::{Depth, XirfToken},
        st::qname::*,
-        CloseSpan, EleNameLen, EleSpan, OpenSpan,
+        CloseSpan, EleNameLen, EleSpan, OpenSpan, QName,
    },
 };

@ -521,3 +521,151 @@ fn child_error_and_recovery() {
        sut.collect()
    );
 }
+
+// A nonterminal of the form `(A | ... | Z)` should accept the element of
+//   any of the inner nonterminals.
+#[test]
+fn sum_nonterminal_accepts_any_valid_element() {
+    #[derive(Debug, PartialEq, Eq)]
+    enum Foo {
+        A,
+        B,
+        C,
+    }
+
+    impl crate::parse::Object for Foo {}
+
+    // QNames don't matter as long as they are unique.
+    const QN_A: QName = QN_PACKAGE;
+    const QN_B: QName = QN_CLASSIFY;
+    const QN_C: QName = QN_EXPORT;
+
+    ele_parse! {
+        type Object = Foo;
+
+        Sut := (A | B | C);
+
+        A := QN_A {
+            @ {} => Foo::A,
+        }
+
+        B := QN_B {
+            @ {} => Foo::B,
+        }
+
+        C := QN_C {
+            @ {} => Foo::C,
+        }
+    }
+
+    use Parsed::*;
+    use XirfToken::{Close, Open};
+
+    // Try each in turn with a fresh instance of `Sut`.
+    [(QN_A, Foo::A), (QN_B, Foo::B), (QN_C, Foo::C)]
+        .into_iter()
+        .for_each(|(qname, obj)| {
+            let toks = vec![
+                Open(qname, OpenSpan(S1, N), Depth(0)),
+                Close(None, CloseSpan::empty(S2), Depth(0)),
+            ];
+
+            assert_eq!(
+                Ok(vec![
+                    Incomplete,  // [X] Open
+                    Object(obj), // [X@] Close (>LA)
+                    Incomplete,  // [X] Close
+                ]),
+                Sut::parse(toks.into_iter()).collect(),
+            );
+        });
+}
+
+#[test]
+fn sum_nonterminal_error_recovery() {
+    #[derive(Debug, PartialEq, Eq)]
+    enum Foo {
+        A,
+        B,
+    }
+
+    impl crate::parse::Object for Foo {}
+
+    // QNames don't matter as long as they are unique.
+    const QN_A: QName = QN_PACKAGE;
+    const QN_B: QName = QN_CLASSIFY;
+    let unexpected: QName = "unexpected".unwrap_into();
+
+    ele_parse! {
+        type Object = Foo;
+
+        Sut := (A | B);
+
+        A := QN_A {
+            @ {} => Foo::A,
+        }
+
+        B := QN_B {
+            @ {} => Foo::B,
+        }
+    }
+
+    // Something >0 just to assert that we're actually paying attention to
+    //   it when consuming tokens during recovery.
+    let depth = Depth(5);
+    let depth_child = Depth(6);
+
+    let toks = vec![
+        // Neither A nor B,
+        //   which will produce an error and enter recovery.
+        XirfToken::Open(unexpected, OpenSpan(S1, N), depth),
+        // A child element to be ignored,
+        //   to ensure that its closing tag will not halt recovery
+        //   prematurely.
+        // This further tests that it's too late to provide a valid opening
+        //   token
+        //     (which is good because we're not at the right depth).
+        XirfToken::Open(QN_A, OpenSpan(S2, N), depth_child),
+        XirfToken::Close(None, CloseSpan::empty(S3), depth_child),
+        // Closing token for the bad element at the corresponding depth,
+        //   which will end recovery.
+        XirfToken::Close(Some(unexpected), CloseSpan(S4, N), depth),
+    ];
+
+    let mut sut = Sut::parse(toks.into_iter());
+
+    // The first token of input is the unexpected element,
+    //   and so should result an error.
+    // The referenced span should be the _name_ of the element,
+    //   not the tag,
+    //   since the error is referring not to the fact that an element
+    //     was encountered
+    //       (which was expected),
+    //       but to the fact that the name was not the one expected.
+    assert_eq!(
+        sut.next(),
+        // TODO: This references generated identifiers.
+        Some(Err(ParseError::StateError(SutError_::UnexpectedEle_(
+            unexpected,
+            OpenSpan(S1, N).name_span(),
+        )))),
+    );
+
+    // We should have now entered a recovery mode whereby we discard
+    //   input until we close the element that introduced the error.
+    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Open child
+    assert_eq!(sut.next(), Some(Ok(Parsed::Incomplete))); // Close child
+
+    // The recovery state must not be in an accepting state,
+    //   because we didn't close at the root depth yet.
+    let (mut sut, _) =
+        sut.finalize().expect_err("recovery must not be accepting");
+
+    // The next token should close the element that is in error,
+    //   and bring us into an accepting state.
+    // But since we are not emitting tokens,
+    //   we'll still be marked as incomplete.
+    assert_eq!(Some(Ok(Parsed::Incomplete)), sut.next()); // Close root
+    sut.finalize()
+        .expect("recovery must complete in an accepting state");
+}