Revert "tamer: xir: Initial re-introduction of AttrEnd"

This reverts commit b973d36862. Alright, I'm getting sick of fighting with myself on this. But rather than just removing the last commit, I'm going to keep it around, so that my thoughts are clearly documented for my future quarrels with myself. Firstly: this added more overhead than I wanted it to. While it wasn't significant, it did add 100--150ms to one of our largest systems, up from ~2.8s, which seems a bit much for a token that's really just meant to make life easier for the parser. Further, it seems that all I've managed to do is push my original problem to a different layer---this started as a means to resolve having to emit both an object and an error simultaneously in the case where aggregate attribute parsing has completed, but we encounter an error on the next token (e.g. an unexpected element). But XIRF, if it's missing AttrEnd, should throw an error, but should also recover. Recovery is easy---just assume that it was present---_but then we don't emit a XIRF `AttrEnd` token_, which is necessary for downstream systems. So we'd need to either: (a) emit both a token and an error; or (b) panic. But if we're doing (a), then the need for `AttrEnd` goes away, because it solves the original problem (though the other concerns of the previous commit still stand). (b) is not ideal at all, even though the missing token does represent an internal system error; it's not something the user can correct. But, given that it's something that the user cannot correct, doesn't that imply that it's an awkward thing to include in the token stream? So back to `AttrEnd` being an awkward PITA to have. So, given (a), I'll just do that: errors will become more of a "hey, this error just occurred, but I'm trying to recover---here's an object that you should use if you choose to continue parsing, but it may or may not be what you're looking for; proceed with caution". That flips the original script: I imagined having external systems feed recovery tokens, but this encapsulates recovery within the parser, which really is more appropriate, though less flexible than having an omniscient external recovery system; such a monolith was always an awkward concept and would be difficult to implement cleanly. This can also potentially be implemented as a generalization of the Dead state change that allowed an object to be emitted alongside the lookahead/error. Anyway, back to where I was...I'm sure I'll look back on this in the future shaking my head, reflecting on how naive I was. DEV-7145
2022-06-29 11:02:18 -04:00 · 2022-06-29 11:02:18 -04:00 · a16a0d9138
parent b973d36862
commit a16a0d9138
6 changed files with 40 additions and 191 deletions
--- a/tamer/src/xir.rs
+++ b/tamer/src/xir.rs
@ -598,22 +598,6 @@ pub enum Token {
    /// Element attribute value.
    AttrValue(SymbolId, Span),

-    /// Delimiter indicating that attribute parsing has ended and that the
-    ///   next token will be either a [`Token::Close`] for this element or a
-    ///   child [`Token::Open`].
-    ///
-    /// This token allows for streaming attribute aggregation without
-    ///   lookahead,
-    ///     and corresponds to the `>` terminal for start tags.
-    /// The token must be present if there is one or more attributes,
-    ///   but is otherwise omitted,
-    ///     with the `>` terminal considered part of [`Token::Open`] and
-    ///     included in its span.
-    ///
-    /// There is no corresponding attribute opening delimiter;
-    ///   such is implied by [`Token::AttrName`].
-    AttrEnd(Span),
-
    /// A portion of an element attribute value.
    ///
    /// This allows for concatenating values into an attribute value without
@ -675,7 +659,6 @@ impl Display for Token {
            Self::AttrValue(attr_val, _) => {
                write!(f, "attribute value `{}`", attr_val)
            }
-            Self::AttrEnd(_) => write!(f, "end of attribute list"),
            Self::AttrValueFragment(attr_val, _) => {
                write!(f, "attribute value fragment `{}`", attr_val)
            }
@ -699,7 +682,6 @@ impl crate::parse::Token for Token {
            | Close(_, CloseSpan(span, _))
            | AttrName(_, span)
            | AttrValue(_, span)
-            | AttrEnd(span)
            | AttrValueFragment(_, span)
            | Comment(_, span)
            | Text(_, span)
--- a/tamer/src/xir/attr/parse.rs
+++ b/tamer/src/xir/attr/parse.rs
@ -37,19 +37,10 @@ use super::Attr;
 ///       they do not influence the automaton's state transitions.
 /// The actual parsing operation is therefore a FSM,
 ///   not a PDA.
-///
-/// This parse may be used to parse a portion of an attribute list,
-///   but it cannot continue after having read a [`XirToken::AttrEnd`],
-///   indicating the end of the attribute list.
 #[derive(Debug, Eq, PartialEq)]
 pub enum AttrParseState {
-    /// Expecting attribute name or end of attribute list.
    Empty,
-    /// Attribute value expected.
    Name(QName, Span),
-    /// End of attribute list;
-    ///   no further attributes expected.
-    End(Span),
 }

 impl ParseState for AttrParseState {
@ -62,18 +53,14 @@ impl ParseState for AttrParseState {
        tok: Self::Token,
        _: NoContext,
    ) -> TransitionResult<Self> {
-        use AttrParseState::{Empty, End, Name};
+        use AttrParseState::{Empty, Name};

        match (self, tok) {
            (Empty, XirToken::AttrName(name, span)) => {
                Transition(Name(name, span)).incomplete()
            }

-            (Empty, XirToken::AttrEnd(span)) => {
-                Transition(End(span)).incomplete()
-            }
-
-            (Empty | End(_), invalid) => Transition(Empty).dead(invalid),
+            (Empty, invalid) => Transition(Empty).dead(invalid),

            (Name(name, nspan), XirToken::AttrValue(value, vspan)) => {
                Transition(Empty).ok(Attr::new(name, value, (nspan, vspan)))
@ -90,7 +77,7 @@ impl ParseState for AttrParseState {

    #[inline]
    fn is_accepting(&self) -> bool {
-        matches!(self, Self::Empty | Self::End(..))
+        *self == Self::Empty
    }
 }

@ -109,9 +96,6 @@ impl Display for AttrParseState {
            Name(name, _) => {
                write!(f, "expecting an attribute value for {name}")
            }
-            End(_) => {
-                write!(f, "finished with attribute parsing")
-            }
        }
    }
 }
@ -160,24 +144,20 @@ impl Diagnostic for AttrParseError {

 #[cfg(test)]
 mod test {
-    use std::assert_matches::assert_matches;
-
    use super::*;
    use crate::{
        convert::ExpectInto,
-        parse::{EmptyContext, ParseError, ParseStatus, Parsed},
+        parse::{EmptyContext, ParseStatus, Parsed},
        sym::GlobalSymbolIntern,
        xir::test::{close_empty, open},
    };

-    const S1: Span = crate::span::DUMMY_SPAN;
-    const S2: Span = S1.offset_add(1).unwrap();
-    const S3: Span = S2.offset_add(1).unwrap();
-    const S4: Span = S3.offset_add(1).unwrap();
+    const S: Span = crate::span::DUMMY_SPAN;
+    const S2: Span = S.offset_add(1).unwrap();

    #[test]
    fn dead_if_first_token_is_non_attr() {
-        let tok = open("foo", S1);
+        let tok = open("foo", S);

        let sut = AttrParseState::default();

@ -199,8 +179,7 @@ mod test {
        let attr = "attr".unwrap_into();
        let val = "val".intern();

-        // AttrEnd not required.
-        let toks = [XirToken::AttrName(attr, S1), XirToken::AttrValue(val, S2)]
+        let toks = [XirToken::AttrName(attr, S), XirToken::AttrValue(val, S2)]
            .into_iter();

        let sut = AttrParseState::parse(toks);
@ -208,18 +187,12 @@ mod test {
        assert_eq!(
            Ok(vec![
                Parsed::Incomplete,
-                Parsed::Object(Attr::new(attr, val, (S1, S2))),
+                Parsed::Object(Attr::new(attr, val, (S, S2))),
            ]),
            sut.collect()
        );
    }

-    // TODO: A proper token will not be substituted;
-    //   parsers are now expected to recover themselves rather than expect
-    //   an external system to substitute tokens,
-    //     unless such an explicit recovery mode is provided to be
-    //     configured by the caller.
-    // The default recovery mode should drop the attribute entirely.
    #[test]
    fn parse_fails_when_attribute_value_missing_but_can_recover() {
        let attr = "bad".unwrap_into();
@ -229,7 +202,7 @@ mod test {
        // This token indicates that we're expecting a value to come next in
        //   the token stream.
        let TransitionResult(Transition(sut), result) =
-            sut.parse_token(XirToken::AttrName(attr, S1), &mut EmptyContext);
+            sut.parse_token(XirToken::AttrName(attr, S), &mut EmptyContext);
        assert_eq!(result, Ok(ParseStatus::Incomplete));

        // But we provide something else unexpected.
@ -237,7 +210,7 @@ mod test {
            sut.parse_token(close_empty(S2), &mut EmptyContext);
        assert_eq!(
            result,
-            Err(AttrParseError::AttrValueExpected(attr, S1, close_empty(S2)))
+            Err(AttrParseError::AttrValueExpected(attr, S, close_empty(S2)))
        );

        // We should not be in an accepting state,
@ -254,46 +227,10 @@ mod test {
            .parse_token(XirToken::AttrValue(recover, S2), &mut EmptyContext);
        assert_eq!(
            result,
-            Ok(ParseStatus::Object(Attr::new(attr, recover, (S1, S2)))),
+            Ok(ParseStatus::Object(Attr::new(attr, recover, (S, S2)))),
        );

        // Finally, we should now be in an accepting state.
        assert!(sut.is_accepting());
    }
-
-    // If `AttrEnd` is encountered,
-    //   every otherwise-valid token after that point is rejected.
-    #[test]
-    fn parsing_ends_at_attr_end() {
-        let name = "attr".unwrap_into();
-        let val = "val".intern();
-
-        let toks = [
-            // This is valid.
-            XirToken::AttrName(name, S1),
-            XirToken::AttrValue(val, S2),
-            // But the list ends here.
-            XirToken::AttrEnd(S3),
-            // And so this token is invalid now.
-            XirToken::AttrName(name, S4),
-        ]
-        .into_iter();
-
-        let mut sut = AttrParseState::parse(toks);
-
-        // The valid attribute should yield.
-        assert_eq!(Some(Ok(Parsed::Incomplete)), sut.next());
-        assert_eq!(
-            Some(Ok(Parsed::Object(Attr::new(name, val, (S1, S2))))),
-            sut.next()
-        );
-
-        // But we must stop parsing after this.
-        assert_eq!(Some(Ok(Parsed::Incomplete)), sut.next()); // AttrEnd
-        assert_matches!(
-            sut.next(),
-            Some(Err(ParseError::UnexpectedToken(given_tok, _)))
-                if given_tok == XirToken::AttrName(name, S4),
-        );
-    }
 }
--- a/tamer/src/xir/flat.rs
+++ b/tamer/src/xir/flat.rs
@ -356,8 +356,7 @@ where
            //   of these tokens.
            XirToken::AttrName(..)
            | XirToken::AttrValue(..)
-            | XirToken::AttrValueFragment(..)
-            | XirToken::AttrEnd(..) => {
+            | XirToken::AttrValueFragment(..) => {
                unreachable!("attribute token in NodeExpected state: {tok:?}")
            }
        }
--- a/tamer/src/xir/reader.rs
+++ b/tamer/src/xir/reader.rs
@ -371,8 +371,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
                    .find(|b| !Self::is_whitespace(**b))
                    .is_some();

-                let attr_len = ele.attributes_raw().len();
-
                // The tail is anything following the last byte of the QName
                //   in a non-empty tag with no attributes.
                // For example:
@ -398,17 +396,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
                        ));
                    }

-                    //      (empty)               (open)
-                    // <foo bar="baz" />     <foo bar="baz"></foo>
-                    //                |                    |
-                    //            zero-length             '>'
-                    let attr_end_span = ctx.span_or_zz(
-                        pos + 1 + len + attr_len,
-                        if empty_tag { 0 } else { 1 },
-                    );
-
-                    tokbuf.push_front(Token::AttrEnd(attr_end_span));
-
                    // No tail because of attributes.
                    0
                } else {
@ -418,7 +405,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
                        // The "attributes" buffer represents whitespace,
                        //   so the tail is the number of bytes of
                        //   whitespace plus the closing '>' tag delimiter.
-                        false => attr_len + 1,
+                        false => ele.attributes_raw().len() + 1,
                    }
                };

--- a/tamer/src/xir/reader/test.rs
+++ b/tamer/src/xir/reader/test.rs
@ -140,15 +140,13 @@ fn empty_node_without_prefix_or_attributes() {
 fn does_not_resolve_xmlns() {
    new_sut!(sut = r#"<no-ns xmlns="noresolve" />"#);
    //                [----] [---]  [-------]  []
-    //                0    5 7   11 14     22 /25
-    //                  A      B        C    D E
-    //
+    //                0    5 7   11 14     22  25
+    //                  A      B        C      D

    let a = DC.span(0, 6);
    let b = DC.span(7, 5);
    let c = DC.span(14, 9);
-    let d = DC.span(25, 0);
-    let e = DC.span(25, 2);
+    let d = DC.span(25, 2);

    assert_eq!(
        Ok(vec![
@ -156,8 +154,7 @@ fn does_not_resolve_xmlns() {
            // Since we didn't parse @xmlns, it's still an attribute.
            O(Token::AttrName("xmlns".unwrap_into(), b)),
            O(Token::AttrValue("noresolve:UNESC".intern(), c)),
-            O(Token::AttrEnd(d)),
-            O(Token::Close(None, CloseSpan(e, 0))),
+            O(Token::Close(None, CloseSpan(d, 0))),
        ]),
        sut.collect(),
    );
@ -168,14 +165,13 @@ fn does_not_resolve_xmlns() {
 fn empty_node_with_prefix_without_attributes_unresolved() {
    new_sut!(sut = r#"<x:empty-node xmlns:x="noresolve" />"#);
    //                [-----------] [-----]  [-------]  []
-    //                0          12 14   20  23     31 /34
-    //                      A          B         C    D E
+    //                0          12 14   20  23     31  34
+    //                      A          B         C      D

    let a = DC.span(0, 13);
    let b = DC.span(14, 7);
    let c = DC.span(23, 9);
-    let d = DC.span(34, 0);
-    let e = DC.span(34, 2);
+    let d = DC.span(34, 2);

    // Should be the QName, _unresolved_.
    assert_eq!(
@ -186,8 +182,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
            )),
            O(Token::AttrName(("xmlns", "x").unwrap_into(), b)),
            O(Token::AttrValue("noresolve:UNESC".intern(), c)),
-            O(Token::AttrEnd(d)),
-            O(Token::Close(None, CloseSpan(e, 0))),
+            O(Token::Close(None, CloseSpan(d, 0))),
        ]),
        sut.collect(),
    );
@ -219,8 +214,8 @@ fn prefix_with_empty_local_name_invalid_qname() {
 fn multiple_attrs_ordered() {
    new_sut!(sut = r#"<ele foo="a" bar="b" b:baz="c" />"#);
    //                [--] [-]  |  [-]  |  [---]  |  []
-    //                0  3 5 7  10 13   18 21 25 28 /31
-    //                 A    B   C  D    E    F    GH I
+    //                0  3 5 7  10 13   18 21 25 28  31
+    //                 A    B   C  D    E    F    G  H

    let a = DC.span(0, 4);
    let b = DC.span(5, 3);
@ -229,8 +224,7 @@ fn multiple_attrs_ordered() {
    let e = DC.span(18, 1);
    let f = DC.span(21, 5);
    let g = DC.span(28, 1);
-    let h = DC.span(31, 0);
-    let i = DC.span(31, 2);
+    let h = DC.span(31, 2);

    assert_eq!(
        Ok(vec![
@ -241,8 +235,7 @@ fn multiple_attrs_ordered() {
            O(Token::AttrValue("b:UNESC".intern(), e)),
            O(Token::AttrName(("b", "baz").unwrap_into(), f)),
            O(Token::AttrValue("c:UNESC".intern(), g)),
-            O(Token::AttrEnd(h)),
-            O(Token::Close(None, CloseSpan(i, 0))),
+            O(Token::Close(None, CloseSpan(h, 0))),
        ]),
        sut.collect(),
    );
@ -252,8 +245,8 @@ fn multiple_attrs_ordered() {
 fn empty_attr_value() {
    new_sut!(sut = r#"<ele empty="" />"#);
    //                [--] [---]  | []
-    //                0  3 5   9 12'14
-    //                 A    B     CDE
+    //                0  3 5   9 12 14
+    //                 A    B     C D
    //                           /
    //              zero-length span, where
    //               the value _would_ be
@ -261,43 +254,14 @@ fn empty_attr_value() {
    let a = DC.span(0, 4);
    let b = DC.span(5, 5);
    let c = DC.span(12, 0);
-    let d = DC.span(14, 0);
-    let e = DC.span(14, 2);
+    let d = DC.span(14, 2);

    assert_eq!(
        Ok(vec![
            O(Token::Open("ele".unwrap_into(), OpenSpan(a, 3))),
            O(Token::AttrName("empty".unwrap_into(), b)),
            O(Token::AttrValue(":UNESC".intern(), c)),
-            O(Token::AttrEnd(d)),
-            O(Token::Close(None, CloseSpan(e, 0))),
-        ]),
-        sut.collect(),
-    );
-}
-
-#[test]
-fn open_node_attr() {
-    new_sut!(sut = r#"<ele foobar="baz"></ele>"#);
-    //                [--] [----]  [-] |[----]
-    //                0  3 5   10 13 15|18  22
-    //                 A     B      C  D   E
-    //                                /
-    //                    AttrEnd here contains '>'
-
-    let a = DC.span(0, 4);
-    let b = DC.span(5, 6);
-    let c = DC.span(13, 3);
-    let d = DC.span(17, 1);
-    let e = DC.span(18, 6);
-
-    assert_eq!(
-        Ok(vec![
-            O(Token::Open("ele".unwrap_into(), OpenSpan(a, 3))),
-            O(Token::AttrName("foobar".unwrap_into(), b)),
-            O(Token::AttrValue("baz:UNESC".intern(), c)),
-            O(Token::AttrEnd(d)),
-            O(Token::Close(Some("ele".unwrap_into()), CloseSpan(e, 3))),
+            O(Token::Close(None, CloseSpan(d, 0))),
        ]),
        sut.collect(),
    );
@ -310,16 +274,15 @@ fn open_node_attr() {
 fn permits_duplicate_attrs() {
    new_sut!(sut = r#"<dup attr="a" attr="b" />"#);
    //                [--] [--]  |  [--]  |  []
-    //                0  3 5  8  11 14 17 20/23
-    //                 A    B    C    D   EF G
+    //                0  3 5  8  11 14 17 20 23
+    //                 A    B    C    D   E  F

    let a = DC.span(0, 4);
    let b = DC.span(5, 4);
    let c = DC.span(11, 1);
    let d = DC.span(14, 4);
    let e = DC.span(20, 1);
-    let f = DC.span(23, 0);
-    let g = DC.span(23, 2);
+    let f = DC.span(23, 2);

    assert_eq!(
        Ok(vec![
@ -328,8 +291,7 @@ fn permits_duplicate_attrs() {
            O(Token::AttrValue("a:UNESC".intern(), c)),
            O(Token::AttrName("attr".unwrap_into(), d)),
            O(Token::AttrValue("b:UNESC".intern(), e)),
-            O(Token::AttrEnd(f)),
-            O(Token::Close(None, CloseSpan(g, 0))),
+            O(Token::Close(None, CloseSpan(f, 0))),
        ]),
        sut.collect(),
    );
@ -456,16 +418,15 @@ fn sibling_nodes() {
 fn child_node_with_attrs() {
    new_sut!(sut = r#"<root><child foo="bar" /></root>"#);
    //                [----][----] [-]  [-]  [][-----]
-    //                0    5`6  11 13  18 20/23`25  31
-    //                  A     B     C    D E F    G
+    //                0    5`6  11 13  18 20 23`25  31
+    //                  A     B     C    D   E    F

    let a = DC.span(0, 6);
    let b = DC.span(6, 6);
    let c = DC.span(13, 3);
    let d = DC.span(18, 3);
-    let e = DC.span(23, 0);
-    let f = DC.span(23, 2);
-    let g = DC.span(25, 7);
+    let e = DC.span(23, 2);
+    let f = DC.span(25, 7);

    assert_eq!(
        Ok(vec![
@ -473,9 +434,8 @@ fn child_node_with_attrs() {
            O(Token::Open("child".unwrap_into(), OpenSpan(b, 5))),
            O(Token::AttrName("foo".unwrap_into(), c)),
            O(Token::AttrValue("bar:UNESC".intern(), d)),
-            O(Token::AttrEnd(e)),
-            O(Token::Close(None, CloseSpan(f, 0))),
-            O(Token::Close(Some("root".unwrap_into()), CloseSpan(g, 4))),
+            O(Token::Close(None, CloseSpan(e, 0))),
+            O(Token::Close(Some("root".unwrap_into()), CloseSpan(f, 4))),
        ]),
        sut.collect(),
    );
--- a/tamer/src/xir/writer.rs
+++ b/tamer/src/xir/writer.rs
@ -255,11 +255,6 @@ impl<S: Escaper> XmlWriter<S> for Token {
                Ok(W::AttrFragmentAdjacent)
            }

-            // AttrEnd is ignored by the writer,
-            //   effectively making it optional for internal XIR generators
-            //   that are used only for output.
-            (Self::AttrEnd(..), st) => Ok(st),
-
            // TODO: We have no way of knowing if text should be formatted
            //   as CData,
            //     which may also be beneficial to avoid escaping if we
@ -500,17 +495,6 @@ mod test {
        Ok(())
    }

-    #[test]
-    fn ignores_attr_end() -> TestResult {
-        let result = Token::AttrEnd(S)
-            .write_new(WriterState::NodeOpen, &MockEscaper::default())?;
-
-        assert_eq!(result.0, b"");
-        assert_eq!(result.1, WriterState::NodeOpen);
-
-        Ok(())
-    }
-
    #[test]
    fn writes_text() -> TestResult {
        let text = "test unescaped".intern();