tamer: xir::flat: Improve parser validation

This does a couple of things: it ensures that documents contain one and only one root node, and it properly handles dead transitions once parsing is complete (allowing it to be composed). This should make XIRF feature-complete for the time being. It does rely on the assumption that the reader is stripping out any trailing whitespace, so I guess we'll see if that's true as we proceed. DEV-10863
2022-03-17 23:22:38 -04:00 · 2022-03-17 23:22:38 -04:00 · 150b3b9aa4
parent f04d845452
commit 150b3b9aa4
3 changed files with 152 additions and 49 deletions
--- a/tamer/src/lib.rs
+++ b/tamer/src/lib.rs
@ -51,6 +51,9 @@
 // We _could_ do without,
 //   but this provides a nicer API.
 #![feature(explicit_generic_args_with_impl_trait)]
+// This simply removes a boilerplate `Default` impl;
+//   we can do without if this does not get finalized.
+#![feature(derive_default_enum)]
 // We build docs for private items.
 #![allow(rustdoc::private_intra_doc_links)]

--- a/tamer/src/xir/flat.rs
+++ b/tamer/src/xir/flat.rs
@ -28,8 +28,10 @@
 //!   1. All closing tags must correspond to a matching opening tag at the
 //!        same depth;
 //!   2. [`Object`] exposes the [`Depth`] of each opening/closing tag;
-//!   3. Attribute tokens are parsed into [`Attr`] objects; and
-//!   4. Parsing will fail if input ends before all elements have been
+//!   3. Attribute tokens are parsed into [`Attr`] objects;
+//!   4. Documents must begin with an element and end with the closing of
+//!        that element;
+//!   5. Parsing will fail if input ends before all elements have been
 //!        closed.
 //!
 //! XIRF lowering does not perform any dynamic memory allocation;
@ -118,29 +120,26 @@ where
 ///   allowing XIRF's parser to avoid memory allocation entirely.
 type ElementStack<const MAX_DEPTH: usize> = ArrayVec<(QName, Span), MAX_DEPTH>;

-/// XIRF parser state.
+/// XIRF document parser state.
 ///
-/// This parser is a pushdown automaton.
-#[derive(Debug, PartialEq, Eq)]
+/// This parser is a pushdown automaton that parses a single XML document.
+#[derive(Debug, Default, PartialEq, Eq)]
 pub enum State<const MAX_DEPTH: usize, SA = AttrParseState>
 where
    SA: FlatAttrParseState,
 {
-    // TODO: Ensure that non-comment nodes are not encountered before the
-    //   root,
-    //     and that we do not encounter any non-comment nodes after the
-    //     root.
+    /// Document parsing has not yet begun.
+    #[default]
+    PreRoot,
+
    /// Parsing nodes.
    NodeExpected(ElementStack<MAX_DEPTH>),

    /// Delegating to attribute parser.
    AttrExpected(ElementStack<MAX_DEPTH>, SA),
-}

-impl<const MD: usize, SA: FlatAttrParseState> Default for State<MD, SA> {
-    fn default() -> Self {
-        Self::NodeExpected(Default::default())
-    }
+    /// End of document has been reached.
+    Done,
 }

 impl<const MAX_DEPTH: usize, SA> ParseState for State<MAX_DEPTH, SA>
@ -152,9 +151,22 @@ where

    fn parse_token(self, tok: Token) -> TransitionResult<Self> {
        use ParseStatus::{Dead, Incomplete, Object as Obj};
-        use State::{AttrExpected, NodeExpected};
+        use State::{AttrExpected, Done, NodeExpected, PreRoot};

        match (self, tok) {
+            // Comments are permitted before and after the root element.
+            (st @ (PreRoot | Done), Token::Comment(sym, span)) => {
+                Transition(st).with(Object::Comment(sym, span))
+            }
+
+            (PreRoot, tok @ Token::Open(..)) => {
+                Self::parse_node(Default::default(), tok)
+            }
+
+            (PreRoot, tok) => {
+                Transition(PreRoot).err(StateError::RootOpenExpected(tok))
+            }
+
            (NodeExpected(stack), tok) => Self::parse_node(stack, tok),

            (AttrExpected(stack, sa), tok) => match sa.parse_token(tok) {
@ -169,6 +181,8 @@ where
                    Transition(AttrExpected(stack, sa)).err(x)
                }
            },
+
+            (Done, tok) => Transition(Done).dead(tok),
        }
    }

@ -182,7 +196,7 @@ where
        // TODO: It'd be nice if we could also return additional context to
        //   aid the user in diagnosing the problem,
        //     e.g. what element(s) still need closing.
-        matches!(self, Self::NodeExpected(stack) if stack.len() == 0)
+        *self == State::Done
    }
 }

@ -196,7 +210,7 @@ where
        tok: Token,
    ) -> TransitionResult<Self> {
        use Object::*;
-        use State::{AttrExpected, NodeExpected};
+        use State::{AttrExpected, Done, NodeExpected};

        match tok {
            Token::Open(qname, span) if stack.len() == MAX_DEPTH => Transition(
@ -221,9 +235,7 @@ where

            Token::Close(close_oqname, close_span) => {
                match (close_oqname, stack.pop()) {
-                    (_, None) => Transition(NodeExpected(stack)).err(
-                        StateError::ExtraClosingTag(close_oqname, close_span),
-                    ),
+                    (_, None) => unreachable!("parser should be in Done state"),

                    (Some(qname), Some((open_qname, open_span)))
                        if qname != open_qname =>
@ -236,6 +248,13 @@ where
                        )
                    }

+                    // Final closing tag (for root node) completes the document.
+                    (..) if stack.len() == 0 => Transition(Done).with(Close(
+                        close_oqname,
+                        close_span,
+                        Depth(0),
+                    )),
+
                    (..) => {
                        let depth = stack.len();

@ -283,6 +302,9 @@ pub fn parse<const MAX_DEPTH: usize>(
 /// Parsing error from [`State`].
 #[derive(Debug, Eq, PartialEq)]
 pub enum StateError {
+    /// Opening root element tag was expected.
+    RootOpenExpected(Token),
+
    /// Opening tag exceeds the maximum nesting depth for this parser.
    MaxDepthExceeded { open: (QName, Span), max: Depth },

@ -293,10 +315,6 @@ pub enum StateError {
        close: (QName, Span),
    },

-    /// Attempt to close a tag with no corresponding opening tag
-    ///   (which would result in a negative depth).
-    ExtraClosingTag(Option<QName>, Span),
-
    /// Error from the attribute parser.
    AttrError(AttrParseError),
 }
@ -306,6 +324,14 @@ impl Display for StateError {
        use StateError::*;

        match self {
+            RootOpenExpected(tok) => {
+                write!(
+                    f,
+                    "opening root element tag expected, \
+                       but found {tok}"
+                )
+            }
+
            MaxDepthExceeded {
                open: (name, span),
                max,
@ -329,18 +355,6 @@ impl Display for StateError {
                )
            }

-            ExtraClosingTag(Some(name), span) => {
-                write!(f, "closing tag `{name}` at {span} has no opening tag",)
-            }
-
-            // If this occurs, its likely that something generated invalid
-            //   XIR;
-            //     it should be a parsing error on read and no generator
-            //     should ever produce this.
-            ExtraClosingTag(None, span) => {
-                write!(f, "self-closing tag at {span} has no opening tag")
-            }
-
            AttrError(e) => Display::fmt(e, f),
        }
    }
--- a/tamer/src/xir/flat/test.rs
+++ b/tamer/src/xir/flat/test.rs
@ -71,19 +71,25 @@ fn empty_element_balanced_close() {
 }

 // More closing tags than opening.
+//
+// We cannot keep the token and throw our own error because this tag may be
+//   part of a parent context.
 #[test]
 fn extra_closing_tag() {
    let name = ("ns", "openclose").unwrap_into();
-    let toks = [Token::Close(Some(name), S)].into_iter();
+    let toks = [
+        // We need an opening tag to actually begin document parsing.
+        Token::Open(name, S),
+        Token::Close(Some(name), S2),
+        Token::Close(Some(name), S3),
+    ]
+    .into_iter();

-    let mut sut = parse::<1>(toks);
+    let sut = parse::<1>(toks);

    assert_eq!(
-        sut.next(),
-        Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
-            Some(name),
-            S,
-        ))))
+        Err(ParseError::UnexpectedToken(Token::Close(Some(name), S3),)),
+        sut.collect::<Result<Vec<Parsed<Object>>, _>>()
    );
 }

@ -92,15 +98,20 @@ fn extra_closing_tag() {
 // gotten to XIRF).
 #[test]
 fn extra_self_closing_tag() {
-    let toks = [Token::Close(None, S)].into_iter();
+    let name = ("ns", "openclose").unwrap_into();
+    let toks = [
+        // We need an opening tag to actually begin document parsing.
+        Token::Open(name, S),
+        Token::Close(None, S2),
+        Token::Close(None, S3),
+    ]
+    .into_iter();

-    let mut sut = parse::<1>(toks);
+    let sut = parse::<1>(toks);

    assert_eq!(
-        sut.next(),
-        Some(Err(ParseError::StateError(StateError::ExtraClosingTag(
-            None, S,
-        ))))
+        Err(ParseError::UnexpectedToken(Token::Close(None, S3),)),
+        sut.collect::<Result<Vec<Parsed<Object>>, _>>()
    );
 }

@ -355,3 +366,78 @@ fn not_accepting_state_if_element_open() {
    // Element was not closed.
    assert_eq!(Some(Err(ParseError::UnexpectedEof(Some(S)))), sut.next());
 }
+
+// XML permits comment nodes before and after the document root element.
+#[test]
+fn comment_before_or_after_root_ok() {
+    let name = "root".unwrap_into();
+    let cstart = "start comment".intern();
+    let cend = "end comment".intern();
+
+    let toks = [
+        Token::Comment(cstart, S),
+        Token::Open(name, S2),
+        Token::Close(None, S3),
+        Token::Comment(cend, S4),
+    ]
+    .into_iter();
+
+    let sut = parse::<1>(toks);
+
+    assert_eq!(
+        Ok(vec![
+            Parsed::Object(Object::Comment(cstart, S)),
+            Parsed::Object(Object::Open(name, S2, Depth(0))),
+            Parsed::Object(Object::Close(None, S3, Depth(0))),
+            Parsed::Object(Object::Comment(cend, S4)),
+        ]),
+        sut.collect(),
+    );
+}
+
+// But there must be no content at the end of the document after the closing
+//   root node.
+// This does not test every applicable token;
+//   you can easily verify the actual implementation at a glance.
+//
+// This is just a dead parser state,
+//   since it's possible for XIRF to be composed and we want to return to
+//   the parent parser.
+#[test]
+fn content_after_root_close_error() {
+    let name = "root".unwrap_into();
+
+    let toks = [
+        Token::Open(name, S),
+        Token::Close(None, S2),
+        // Document ends here
+        Token::Open(name, S3),
+    ]
+    .into_iter();
+
+    let sut = parse::<1>(toks);
+
+    assert_eq!(
+        Result::<Vec<Parsed<Object>>, _>::Err(ParseError::UnexpectedToken(
+            Token::Open(name, S3)
+        )),
+        sut.collect()
+    );
+}
+
+// Non-comment nodes cannot appear before the opening root tag.
+#[test]
+fn content_before_root_open_error() {
+    let text = "foo".intern();
+
+    let toks = [Token::Text(text, S)].into_iter();
+
+    let sut = parse::<1>(toks);
+
+    assert_eq!(
+        Result::<Vec<Parsed<Object>>, _>::Err(ParseError::StateError(
+            StateError::RootOpenExpected(Token::Text(text, S))
+        )),
+        sut.collect()
+    );
+}