Revert "tamer: xir: Initial re-introduction of AttrEnd"

This reverts commit b973d36862.

Alright, I'm getting sick of fighting with myself on this.  But rather than
just removing the last commit, I'm going to keep it around, so that my
thoughts are clearly documented for my future quarrels with myself.

Firstly: this added more overhead than I wanted it to.  While it wasn't
significant, it did add 100--150ms to one of our largest systems, up from
~2.8s, which seems a bit much for a token that's really just meant to make
life easier for the parser.

Further, it seems that all I've managed to do is push my original problem to
a different layer---this started as a means to resolve having to emit both
an object and an error simultaneously in the case where aggregate attribute
parsing has completed, but we encounter an error on the next token (e.g. an
unexpected element).  But XIRF, if it's missing AttrEnd, should throw an
error, but should also recover.  Recovery is easy---just assume that it was
present---_but then we don't emit a XIRF `AttrEnd` token_, which is
necessary for downstream systems.  So we'd need to either:

  (a) emit both a token and an error; or
  (b) panic.

But if we're doing (a), then the need for `AttrEnd` goes away, because it
solves the original problem (though the other concerns of the previous
commit still stand).  (b) is not ideal at all, even though the missing token
does represent an internal system error; it's not something the user can
correct.  But, given that it's something that the user cannot correct,
doesn't that imply that it's an awkward thing to include in the token
stream?  So back to `AttrEnd` being an awkward PITA to have.

So, given (a), I'll just do that: errors will become more of a "hey, this
error just occurred, but I'm trying to recover---here's an object that you
should use if you choose to continue parsing, but it may or may not be what
you're looking for; proceed with caution".  That flips the original script:
I imagined having external systems feed recovery tokens, but this
encapsulates recovery within the parser, which really is more appropriate,
though less flexible than having an omniscient external recovery system;
such a monolith was always an awkward concept and would be difficult to
implement cleanly.

This can also potentially be implemented as a generalization of the Dead
state change that allowed an object to be emitted alongside the
lookahead/error.

Anyway, back to where I was...I'm sure I'll look back on this in the future
shaking my head, reflecting on how naive I was.

DEV-7145
main
Mike Gerwitz 2022-06-29 11:02:18 -04:00
parent b973d36862
commit a16a0d9138
6 changed files with 40 additions and 191 deletions

View File

@ -598,22 +598,6 @@ pub enum Token {
/// Element attribute value.
AttrValue(SymbolId, Span),
/// Delimiter indicating that attribute parsing has ended and that the
/// next token will be either a [`Token::Close`] for this element or a
/// child [`Token::Open`].
///
/// This token allows for streaming attribute aggregation without
/// lookahead,
/// and corresponds to the `>` terminal for start tags.
/// The token must be present if there is one or more attributes,
/// but is otherwise omitted,
/// with the `>` terminal considered part of [`Token::Open`] and
/// included in its span.
///
/// There is no corresponding attribute opening delimiter;
/// such is implied by [`Token::AttrName`].
AttrEnd(Span),
/// A portion of an element attribute value.
///
/// This allows for concatenating values into an attribute value without
@ -675,7 +659,6 @@ impl Display for Token {
Self::AttrValue(attr_val, _) => {
write!(f, "attribute value `{}`", attr_val)
}
Self::AttrEnd(_) => write!(f, "end of attribute list"),
Self::AttrValueFragment(attr_val, _) => {
write!(f, "attribute value fragment `{}`", attr_val)
}
@ -699,7 +682,6 @@ impl crate::parse::Token for Token {
| Close(_, CloseSpan(span, _))
| AttrName(_, span)
| AttrValue(_, span)
| AttrEnd(span)
| AttrValueFragment(_, span)
| Comment(_, span)
| Text(_, span)

View File

@ -37,19 +37,10 @@ use super::Attr;
/// they do not influence the automaton's state transitions.
/// The actual parsing operation is therefore a FSM,
/// not a PDA.
///
/// This parse may be used to parse a portion of an attribute list,
/// but it cannot continue after having read a [`XirToken::AttrEnd`],
/// indicating the end of the attribute list.
#[derive(Debug, Eq, PartialEq)]
pub enum AttrParseState {
/// Expecting attribute name or end of attribute list.
Empty,
/// Attribute value expected.
Name(QName, Span),
/// End of attribute list;
/// no further attributes expected.
End(Span),
}
impl ParseState for AttrParseState {
@ -62,18 +53,14 @@ impl ParseState for AttrParseState {
tok: Self::Token,
_: NoContext,
) -> TransitionResult<Self> {
use AttrParseState::{Empty, End, Name};
use AttrParseState::{Empty, Name};
match (self, tok) {
(Empty, XirToken::AttrName(name, span)) => {
Transition(Name(name, span)).incomplete()
}
(Empty, XirToken::AttrEnd(span)) => {
Transition(End(span)).incomplete()
}
(Empty | End(_), invalid) => Transition(Empty).dead(invalid),
(Empty, invalid) => Transition(Empty).dead(invalid),
(Name(name, nspan), XirToken::AttrValue(value, vspan)) => {
Transition(Empty).ok(Attr::new(name, value, (nspan, vspan)))
@ -90,7 +77,7 @@ impl ParseState for AttrParseState {
#[inline]
fn is_accepting(&self) -> bool {
matches!(self, Self::Empty | Self::End(..))
*self == Self::Empty
}
}
@ -109,9 +96,6 @@ impl Display for AttrParseState {
Name(name, _) => {
write!(f, "expecting an attribute value for {name}")
}
End(_) => {
write!(f, "finished with attribute parsing")
}
}
}
}
@ -160,24 +144,20 @@ impl Diagnostic for AttrParseError {
#[cfg(test)]
mod test {
use std::assert_matches::assert_matches;
use super::*;
use crate::{
convert::ExpectInto,
parse::{EmptyContext, ParseError, ParseStatus, Parsed},
parse::{EmptyContext, ParseStatus, Parsed},
sym::GlobalSymbolIntern,
xir::test::{close_empty, open},
};
const S1: Span = crate::span::DUMMY_SPAN;
const S2: Span = S1.offset_add(1).unwrap();
const S3: Span = S2.offset_add(1).unwrap();
const S4: Span = S3.offset_add(1).unwrap();
const S: Span = crate::span::DUMMY_SPAN;
const S2: Span = S.offset_add(1).unwrap();
#[test]
fn dead_if_first_token_is_non_attr() {
let tok = open("foo", S1);
let tok = open("foo", S);
let sut = AttrParseState::default();
@ -199,8 +179,7 @@ mod test {
let attr = "attr".unwrap_into();
let val = "val".intern();
// AttrEnd not required.
let toks = [XirToken::AttrName(attr, S1), XirToken::AttrValue(val, S2)]
let toks = [XirToken::AttrName(attr, S), XirToken::AttrValue(val, S2)]
.into_iter();
let sut = AttrParseState::parse(toks);
@ -208,18 +187,12 @@ mod test {
assert_eq!(
Ok(vec![
Parsed::Incomplete,
Parsed::Object(Attr::new(attr, val, (S1, S2))),
Parsed::Object(Attr::new(attr, val, (S, S2))),
]),
sut.collect()
);
}
// TODO: A proper token will not be substituted;
// parsers are now expected to recover themselves rather than expect
// an external system to substitute tokens,
// unless such an explicit recovery mode is provided to be
// configured by the caller.
// The default recovery mode should drop the attribute entirely.
#[test]
fn parse_fails_when_attribute_value_missing_but_can_recover() {
let attr = "bad".unwrap_into();
@ -229,7 +202,7 @@ mod test {
// This token indicates that we're expecting a value to come next in
// the token stream.
let TransitionResult(Transition(sut), result) =
sut.parse_token(XirToken::AttrName(attr, S1), &mut EmptyContext);
sut.parse_token(XirToken::AttrName(attr, S), &mut EmptyContext);
assert_eq!(result, Ok(ParseStatus::Incomplete));
// But we provide something else unexpected.
@ -237,7 +210,7 @@ mod test {
sut.parse_token(close_empty(S2), &mut EmptyContext);
assert_eq!(
result,
Err(AttrParseError::AttrValueExpected(attr, S1, close_empty(S2)))
Err(AttrParseError::AttrValueExpected(attr, S, close_empty(S2)))
);
// We should not be in an accepting state,
@ -254,46 +227,10 @@ mod test {
.parse_token(XirToken::AttrValue(recover, S2), &mut EmptyContext);
assert_eq!(
result,
Ok(ParseStatus::Object(Attr::new(attr, recover, (S1, S2)))),
Ok(ParseStatus::Object(Attr::new(attr, recover, (S, S2)))),
);
// Finally, we should now be in an accepting state.
assert!(sut.is_accepting());
}
// If `AttrEnd` is encountered,
// every otherwise-valid token after that point is rejected.
#[test]
fn parsing_ends_at_attr_end() {
let name = "attr".unwrap_into();
let val = "val".intern();
let toks = [
// This is valid.
XirToken::AttrName(name, S1),
XirToken::AttrValue(val, S2),
// But the list ends here.
XirToken::AttrEnd(S3),
// And so this token is invalid now.
XirToken::AttrName(name, S4),
]
.into_iter();
let mut sut = AttrParseState::parse(toks);
// The valid attribute should yield.
assert_eq!(Some(Ok(Parsed::Incomplete)), sut.next());
assert_eq!(
Some(Ok(Parsed::Object(Attr::new(name, val, (S1, S2))))),
sut.next()
);
// But we must stop parsing after this.
assert_eq!(Some(Ok(Parsed::Incomplete)), sut.next()); // AttrEnd
assert_matches!(
sut.next(),
Some(Err(ParseError::UnexpectedToken(given_tok, _)))
if given_tok == XirToken::AttrName(name, S4),
);
}
}

View File

@ -356,8 +356,7 @@ where
// of these tokens.
XirToken::AttrName(..)
| XirToken::AttrValue(..)
| XirToken::AttrValueFragment(..)
| XirToken::AttrEnd(..) => {
| XirToken::AttrValueFragment(..) => {
unreachable!("attribute token in NodeExpected state: {tok:?}")
}
}

View File

@ -371,8 +371,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
.find(|b| !Self::is_whitespace(**b))
.is_some();
let attr_len = ele.attributes_raw().len();
// The tail is anything following the last byte of the QName
// in a non-empty tag with no attributes.
// For example:
@ -398,17 +396,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
));
}
// (empty) (open)
// <foo bar="baz" /> <foo bar="baz"></foo>
// | |
// zero-length '>'
let attr_end_span = ctx.span_or_zz(
pos + 1 + len + attr_len,
if empty_tag { 0 } else { 1 },
);
tokbuf.push_front(Token::AttrEnd(attr_end_span));
// No tail because of attributes.
0
} else {
@ -418,7 +405,7 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
// The "attributes" buffer represents whitespace,
// so the tail is the number of bytes of
// whitespace plus the closing '>' tag delimiter.
false => attr_len + 1,
false => ele.attributes_raw().len() + 1,
}
};

View File

@ -140,15 +140,13 @@ fn empty_node_without_prefix_or_attributes() {
fn does_not_resolve_xmlns() {
new_sut!(sut = r#"<no-ns xmlns="noresolve" />"#);
// [----] [---] [-------] []
// 0 5 7 11 14 22 /25
// A B C D E
//
// 0 5 7 11 14 22 25
// A B C D
let a = DC.span(0, 6);
let b = DC.span(7, 5);
let c = DC.span(14, 9);
let d = DC.span(25, 0);
let e = DC.span(25, 2);
let d = DC.span(25, 2);
assert_eq!(
Ok(vec![
@ -156,8 +154,7 @@ fn does_not_resolve_xmlns() {
// Since we didn't parse @xmlns, it's still an attribute.
O(Token::AttrName("xmlns".unwrap_into(), b)),
O(Token::AttrValue("noresolve:UNESC".intern(), c)),
O(Token::AttrEnd(d)),
O(Token::Close(None, CloseSpan(e, 0))),
O(Token::Close(None, CloseSpan(d, 0))),
]),
sut.collect(),
);
@ -168,14 +165,13 @@ fn does_not_resolve_xmlns() {
fn empty_node_with_prefix_without_attributes_unresolved() {
new_sut!(sut = r#"<x:empty-node xmlns:x="noresolve" />"#);
// [-----------] [-----] [-------] []
// 0 12 14 20 23 31 /34
// A B C D E
// 0 12 14 20 23 31 34
// A B C D
let a = DC.span(0, 13);
let b = DC.span(14, 7);
let c = DC.span(23, 9);
let d = DC.span(34, 0);
let e = DC.span(34, 2);
let d = DC.span(34, 2);
// Should be the QName, _unresolved_.
assert_eq!(
@ -186,8 +182,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
)),
O(Token::AttrName(("xmlns", "x").unwrap_into(), b)),
O(Token::AttrValue("noresolve:UNESC".intern(), c)),
O(Token::AttrEnd(d)),
O(Token::Close(None, CloseSpan(e, 0))),
O(Token::Close(None, CloseSpan(d, 0))),
]),
sut.collect(),
);
@ -219,8 +214,8 @@ fn prefix_with_empty_local_name_invalid_qname() {
fn multiple_attrs_ordered() {
new_sut!(sut = r#"<ele foo="a" bar="b" b:baz="c" />"#);
// [--] [-] | [-] | [---] | []
// 0 3 5 7 10 13 18 21 25 28 /31
// A B C D E F GH I
// 0 3 5 7 10 13 18 21 25 28 31
// A B C D E F G H
let a = DC.span(0, 4);
let b = DC.span(5, 3);
@ -229,8 +224,7 @@ fn multiple_attrs_ordered() {
let e = DC.span(18, 1);
let f = DC.span(21, 5);
let g = DC.span(28, 1);
let h = DC.span(31, 0);
let i = DC.span(31, 2);
let h = DC.span(31, 2);
assert_eq!(
Ok(vec![
@ -241,8 +235,7 @@ fn multiple_attrs_ordered() {
O(Token::AttrValue("b:UNESC".intern(), e)),
O(Token::AttrName(("b", "baz").unwrap_into(), f)),
O(Token::AttrValue("c:UNESC".intern(), g)),
O(Token::AttrEnd(h)),
O(Token::Close(None, CloseSpan(i, 0))),
O(Token::Close(None, CloseSpan(h, 0))),
]),
sut.collect(),
);
@ -252,8 +245,8 @@ fn multiple_attrs_ordered() {
fn empty_attr_value() {
new_sut!(sut = r#"<ele empty="" />"#);
// [--] [---] | []
// 0 3 5 9 12'14
// A B CDE
// 0 3 5 9 12 14
// A B C D
// /
// zero-length span, where
// the value _would_ be
@ -261,43 +254,14 @@ fn empty_attr_value() {
let a = DC.span(0, 4);
let b = DC.span(5, 5);
let c = DC.span(12, 0);
let d = DC.span(14, 0);
let e = DC.span(14, 2);
let d = DC.span(14, 2);
assert_eq!(
Ok(vec![
O(Token::Open("ele".unwrap_into(), OpenSpan(a, 3))),
O(Token::AttrName("empty".unwrap_into(), b)),
O(Token::AttrValue(":UNESC".intern(), c)),
O(Token::AttrEnd(d)),
O(Token::Close(None, CloseSpan(e, 0))),
]),
sut.collect(),
);
}
#[test]
fn open_node_attr() {
new_sut!(sut = r#"<ele foobar="baz"></ele>"#);
// [--] [----] [-] |[----]
// 0 3 5 10 13 15|18 22
// A B C D E
// /
// AttrEnd here contains '>'
let a = DC.span(0, 4);
let b = DC.span(5, 6);
let c = DC.span(13, 3);
let d = DC.span(17, 1);
let e = DC.span(18, 6);
assert_eq!(
Ok(vec![
O(Token::Open("ele".unwrap_into(), OpenSpan(a, 3))),
O(Token::AttrName("foobar".unwrap_into(), b)),
O(Token::AttrValue("baz:UNESC".intern(), c)),
O(Token::AttrEnd(d)),
O(Token::Close(Some("ele".unwrap_into()), CloseSpan(e, 3))),
O(Token::Close(None, CloseSpan(d, 0))),
]),
sut.collect(),
);
@ -310,16 +274,15 @@ fn open_node_attr() {
fn permits_duplicate_attrs() {
new_sut!(sut = r#"<dup attr="a" attr="b" />"#);
// [--] [--] | [--] | []
// 0 3 5 8 11 14 17 20/23
// A B C D EF G
// 0 3 5 8 11 14 17 20 23
// A B C D E F
let a = DC.span(0, 4);
let b = DC.span(5, 4);
let c = DC.span(11, 1);
let d = DC.span(14, 4);
let e = DC.span(20, 1);
let f = DC.span(23, 0);
let g = DC.span(23, 2);
let f = DC.span(23, 2);
assert_eq!(
Ok(vec![
@ -328,8 +291,7 @@ fn permits_duplicate_attrs() {
O(Token::AttrValue("a:UNESC".intern(), c)),
O(Token::AttrName("attr".unwrap_into(), d)),
O(Token::AttrValue("b:UNESC".intern(), e)),
O(Token::AttrEnd(f)),
O(Token::Close(None, CloseSpan(g, 0))),
O(Token::Close(None, CloseSpan(f, 0))),
]),
sut.collect(),
);
@ -456,16 +418,15 @@ fn sibling_nodes() {
fn child_node_with_attrs() {
new_sut!(sut = r#"<root><child foo="bar" /></root>"#);
// [----][----] [-] [-] [][-----]
// 0 5`6 11 13 18 20/23`25 31
// A B C D E F G
// 0 5`6 11 13 18 20 23`25 31
// A B C D E F
let a = DC.span(0, 6);
let b = DC.span(6, 6);
let c = DC.span(13, 3);
let d = DC.span(18, 3);
let e = DC.span(23, 0);
let f = DC.span(23, 2);
let g = DC.span(25, 7);
let e = DC.span(23, 2);
let f = DC.span(25, 7);
assert_eq!(
Ok(vec![
@ -473,9 +434,8 @@ fn child_node_with_attrs() {
O(Token::Open("child".unwrap_into(), OpenSpan(b, 5))),
O(Token::AttrName("foo".unwrap_into(), c)),
O(Token::AttrValue("bar:UNESC".intern(), d)),
O(Token::AttrEnd(e)),
O(Token::Close(None, CloseSpan(f, 0))),
O(Token::Close(Some("root".unwrap_into()), CloseSpan(g, 4))),
O(Token::Close(None, CloseSpan(e, 0))),
O(Token::Close(Some("root".unwrap_into()), CloseSpan(f, 4))),
]),
sut.collect(),
);

View File

@ -255,11 +255,6 @@ impl<S: Escaper> XmlWriter<S> for Token {
Ok(W::AttrFragmentAdjacent)
}
// AttrEnd is ignored by the writer,
// effectively making it optional for internal XIR generators
// that are used only for output.
(Self::AttrEnd(..), st) => Ok(st),
// TODO: We have no way of knowing if text should be formatted
// as CData,
// which may also be beneficial to avoid escaping if we
@ -500,17 +495,6 @@ mod test {
Ok(())
}
#[test]
fn ignores_attr_end() -> TestResult {
let result = Token::AttrEnd(S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
#[test]
fn writes_text() -> TestResult {
let text = "test unescaped".intern();