tamer: xir::Token::AttrEnd: Remove

More information can be found in the prior commit message, but I'll
summarize here.

This token was introduced to create an LL(0) parser---no tokens of
lookahead.  This allowed the underlying TokenStream to be freely passed to
the next system that needed it.
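
To make that concrete, here is a rough sketch of the delimiter-driven
approach (the types and names below are illustrative only, not the
actual xir API): because AttrEnd explicitly terminates the attribute
list, the collector never needs to peek at the next token, and the
iterator can be handed off untouched once the delimiter is seen.

    // Illustrative stand-in for xir::Token, stripped of spans and interning.
    #[derive(Debug)]
    enum Tok {
        AttrName(&'static str),
        AttrValue(&'static str),
        AttrEnd,
        Open(&'static str),
    }

    // LL(0): every decision is made from the current token alone.
    fn collect_attrs(
        toks: &mut impl Iterator<Item = Tok>,
    ) -> Vec<(&'static str, &'static str)> {
        let mut attrs = Vec::new();
        while let Some(tok) = toks.next() {
            match tok {
                Tok::AttrName(name) => {
                    if let Some(Tok::AttrValue(value)) = toks.next() {
                        attrs.push((name, value));
                    }
                }
                // Explicit delimiter: we are done, no lookahead required.
                Tok::AttrEnd => break,
                other => panic!("unexpected token in attribute context: {other:?}"),
            }
        }
        attrs
    }

Without that delimiter, the only way to know that the attribute list
has ended is to inspect the token that follows it, which is precisely
the single token of lookahead described next.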

Since then, Parser and ParseState were introduced, along with
ParseStatus::Dead, which adds the concept of a single token of
lookahead---an LL(1) grammar.
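
A minimal sketch of that mechanism (again illustrative, building on the
Tok type above rather than the real Parser/ParseState API): the
attribute parser no longer waits for a delimiter; the moment it sees a
token outside of its grammar it reports a dead state without consuming
that token, and the caller replays it itself (one token of lookahead),
just as the element stack below does with Ok(Dead(lookahead)).

    // Still illustrative: a pared-down analogue of ParseStatus.
    enum Status<O> {
        Incomplete, // token consumed, no object ready yet
        Object(O),  // token consumed, object produced
        Dead(Tok),  // token NOT consumed; the caller must replay it
    }

    // Attribute parser that dies on the first token outside its grammar
    // instead of expecting an AttrEnd delimiter.
    struct AttrState(Option<&'static str>);

    impl AttrState {
        fn parse_token(&mut self, tok: Tok) -> Status<(&'static str, &'static str)> {
            match (self.0.take(), tok) {
                (None, Tok::AttrName(name)) => {
                    self.0 = Some(name);
                    Status::Incomplete
                }
                (Some(name), Tok::AttrValue(value)) => Status::Object((name, value)),
                // e.g. Tok::Open: attribute parsing is over; hand the
                // unconsumed token back as lookahead.
                (_, other) => Status::Dead(other),
            }
        }
    }

    // The parent parser replays the dead token itself, which is what
    // removes the need for AttrEnd entirely.
    fn parse_element(toks: &mut impl Iterator<Item = Tok>) {
        let mut attrs = AttrState(None);
        while let Some(tok) = toks.next() {
            match attrs.parse_token(tok) {
                Status::Incomplete => {}
                Status::Object((name, value)) => println!("attr {name}={value}"),
                Status::Dead(lookahead) => {
                    println!("attrs done; element content starts at {lookahead:?}");
                    break;
                }
            }
        }
    }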

I had always suspected that this would happen, given the awkwardness of
AttrEnd; it was just a matter of time before the right abstraction
manifested itself to handle lookahead.

DEV-11339
main
Mike Gerwitz 2021-12-17 10:14:31 -05:00
parent 61f7a12975
commit 0cc0bc9d5a
9 changed files with 26 additions and 131 deletions

View File

@@ -56,7 +56,6 @@ fn parses_package_attrs() {
Token::AttrValue(raw::L_TRUE, DS),
Token::AttrName(("preproc", "elig-class-yields").unwrap_into(), DS),
Token::AttrValue(elig, DS),
Token::AttrEnd(DS),
]
.into_iter(),
);
@@ -84,7 +83,6 @@ fn parses_package_attrs_with_ns_prefix() {
Token::Open(("lv", "package").unwrap_into(), DS),
Token::AttrName("name".unwrap_into(), DS),
Token::AttrValue(name, DS),
Token::AttrEnd(DS),
]
.into_iter(),
);

View File

@@ -509,28 +509,6 @@ pub enum Token {
/// but it was removed.)
AttrValueFragment(SymbolId, Span),
/// A delimiter indicating that attribute processing has ended and the
/// next token will be either a child node or [`Token::Close`].
///
/// This allows for streaming attribute collection without any
/// lookahead,
/// which would otherwise require an iterator supporting a `peek`
/// operation.
///
/// This is mandatory for _readers_ to produce,
/// but _writers must ignore it and not require it to be present_,
/// allowing for the reduction of token counts for generated XIR in
/// situations where we know that it will not be further parsed.
///
/// The [`Span`] ought to be the final byte of the preceding attribute,
/// and is required only so that we can guarantee an API that can
/// produce a [`Span`] for any given [`Token`].
/// The span position cannot be after the preceding attribute because,
/// if attributes are parsed in isolation,
/// the following byte is outside of the context that we are permitted
/// to parse.
AttrEnd(Span),
/// Comment node.
Comment(SymbolId, Span),
@@ -575,7 +553,6 @@ impl Display for Token {
Self::AttrValueFragment(attr_val, span) => {
write!(f, "attribute value fragment `{}` at {}", attr_val, span)
}
Self::AttrEnd(span) => write!(f, "end of attributes at {}", span),
// TODO: Safe truncated comment.
Self::Comment(_, span) => write!(f, "comment at {}", span),
// TODO: Safe truncated text.
@@ -600,7 +577,6 @@ impl Token {
| AttrName(_, span)
| AttrValue(_, span)
| AttrValueFragment(_, span)
| AttrEnd(span)
| Comment(_, span)
| Text(_, span)
| CData(_, span)

View File

@@ -226,11 +226,6 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
}
// Indicate the end of attributes even if no attributes were output.
// This allows for a reliable delimiter that can be used without
// lookahead for streaming attribute parsing.
tokbuf.push_front(Token::AttrEnd(DUMMY_SPAN));
Ok(())
}
}

View File

@@ -95,7 +95,6 @@ fn empty_node_without_prefix_or_attributes() {
result.expect("parsing failed"),
vec![
Token::Open("empty-node".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
],
);
@@ -115,7 +114,6 @@ fn does_not_resolve_xmlns() {
// Since we didn't parse @xmlns, it's still an attribute.
Token::AttrName("xmlns".unwrap_into(), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
],
);
@@ -135,7 +133,6 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
Token::Open(("x", "empty-node").unwrap_into(), DUMMY_SPAN),
Token::AttrName(("xmlns", "x").unwrap_into(), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
],
);
@@ -174,7 +171,6 @@ fn multiple_attrs_ordered() {
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN),
Token::AttrValue("c:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
],
);
@@ -196,7 +192,6 @@ fn permits_duplicate_attrs() {
Token::AttrValue("a:UNESC".intern(), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
],
);
@@ -212,9 +207,7 @@ fn child_node_self_closing() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -231,12 +224,9 @@ fn sibling_nodes() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -253,11 +243,9 @@ fn child_node_with_attrs() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue("bar:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -274,7 +262,6 @@ fn child_text() {
result.expect("parsing failed"),
vec![
Token::Open("text".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Text("foo bar:UNESC".into(), DUMMY_SPAN),
Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
],
@@ -291,10 +278,8 @@ fn mixed_child_content() {
result.expect("parsing failed"),
vec![
Token::Open("text".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Text("foo:UNESC".into(), DUMMY_SPAN),
Token::Open("em".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Text("bar:UNESC".into(), DUMMY_SPAN),
Token::Close(Some("em".unwrap_into()), DUMMY_SPAN),
Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
@@ -322,10 +307,8 @@ fn mixed_child_content_with_newlines() {
vec![
Token::Text("\n:UNESC".into(), DUMMY_SPAN),
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Text("\n :UNESC".into(), DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Text("\n:UNESC".into(), DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
@@ -345,7 +328,6 @@ fn comment() {
vec![
Token::Comment("root".into(), DUMMY_SPAN),
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Comment("<child>".into(), DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -367,7 +349,6 @@ lines-->
result.expect("parsing failed"),
vec![
Token::Open("mult".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Comment("comment\non multiple\nlines".into(), DUMMY_SPAN),
Token::Text("\n:UNESC".into(), DUMMY_SPAN),
Token::Close(Some("mult".unwrap_into()), DUMMY_SPAN),
@@ -386,9 +367,7 @@ fn permits_mismatched_tags() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd(DUMMY_SPAN),
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("mismatch".unwrap_into()), DUMMY_SPAN),
],

View File

@@ -533,11 +533,6 @@ impl<SA: StackAttrParseState> ParseState for Stack<SA> {
*self = Self::AttrState(estack, attrs, sa);
Ok(Incomplete)
}
// This will likely go away with AttrEnd.
Ok(Done) => {
*self = Self::BuddingElement(estack.consume_attrs(attrs));
Ok(Incomplete)
}
Ok(Dead(lookahead)) => {
*self = Self::BuddingElement(estack.consume_attrs(attrs));
self.parse_token(lookahead)
@@ -596,8 +591,6 @@ impl<SA: StackAttrParseState> Stack<SA> {
Self::BuddingElement(pstack) => Some(pstack.store()),
// Opening a child element in attribute parsing context.
// Automatically close the attributes despite a missing
// AttrEnd to accommodate non-reader XIR.
Self::BuddingAttrList(pstack, attr_list) => {
Some(pstack.consume_attrs(attr_list).store())
}
@@ -626,10 +619,6 @@ impl<SA: StackAttrParseState> Stack<SA> {
.try_close(name, span)
.map(ElementStack::consume_child_or_complete),
// We can implicitly complete the attribute list if there's a
// missing `Token::AttrEnd`,
// which alleviates us from having to unnecessarily generate
// it outside of readers.
Self::BuddingAttrList(stack, attr_list) => stack
.consume_attrs(attr_list)
.try_close(name, span)
@@ -660,10 +649,6 @@ impl<SA: StackAttrParseState> Stack<SA> {
ParseStatus::Object(Tree::Element(ele))
}
// This parser has completed relative to its initial context and
// is not expecting any further input.
Stack::Done => ParseStatus::Done,
_ => {
*self = new_stack;
ParseStatus::Incomplete

View File

@@ -52,8 +52,6 @@ impl ParseState for AttrParserState {
use AttrParserState::*;
match (take(self), tok) {
(Empty, Token::AttrEnd(_)) => return Ok(ParseStatus::Done),
(Empty, Token::AttrName(name, span)) => {
*self = Name(name, span);
Ok(ParseStatus::Incomplete)
@@ -187,11 +185,11 @@ mod test {
// But we provide something else unexpected.
assert_eq!(
sut.parse_token(Token::AttrEnd(*S2)),
sut.parse_token(Token::Close(None, *S2)),
Err(AttrParseError::AttrValueExpected(
attr,
*S,
Token::AttrEnd(*S2)
Token::Close(None, *S2)
))
);
@@ -214,12 +212,4 @@ mod test {
// Finally, we should now be in an accepting state.
assert!(sut.is_accepting());
}
#[test]
fn yields_none_on_attr_end() {
let mut sut = AttrParserState::default();
assert_eq!(sut.parse_token(Token::AttrEnd(*S)), Ok(ParseStatus::Done));
assert!(sut.is_accepting());
}
}

View File

@@ -169,8 +169,6 @@ impl<S: ParseState, I: TokenStream> Iterator for Parser<S, I> {
use ParseStatus::*;
match self.state.parse_token(tok) {
Ok(Done) => None,
// Nothing handled this dead state,
// and we cannot discard a lookahead token,
// so we have no choice but to produce an error.
@@ -303,8 +301,7 @@ pub enum ParseStatus<T> {
/// Parsing of an object is complete.
///
/// This does not indicate that the parser is complete,
/// as more objects may be able to be emitted;
/// see [`ParseStatus::Done`].
/// as more objects may be able to be emitted.
Object(T),
/// Parser encountered a dead state relative to the given token.
@@ -331,13 +328,6 @@ pub enum ParseStatus<T> {
/// If there is no parent context to handle the token,
/// [`Parser`] must yield an error.
Dead(Token),
/// Parsing is complete.
///
/// This should cause an iterator to yield [`None`].
/// If a parser is a combination of multiple [`ParseState`]s,
/// this should transition to the next appropriate state.
Done,
}
/// Result of a parsing operation.
@@ -362,8 +352,8 @@ impl<T> From<ParseStatus<T>> for Parsed<T> {
match status {
ParseStatus::Incomplete => Parsed::Incomplete,
ParseStatus::Object(x) => Parsed::Object(x),
ParseStatus::Dead(_) | ParseStatus::Done => {
unreachable!("Dead/Done status must be filtered by Parser")
ParseStatus::Dead(_) => {
unreachable!("Dead status must be filtered by Parser")
}
}
}
@@ -394,13 +384,13 @@ pub mod test {
fn parse_token(&mut self, tok: Token) -> ParseStateResult<Self> {
match tok {
Token::AttrEnd(..) => {
Token::Comment(..) => {
*self = Self::Done;
}
Token::Close(..) => {
return Err(EchoStateError::InnerError(tok))
}
Token::Comment(..) => return Ok(ParseStatus::Dead(tok)),
Token::Text(..) => return Ok(ParseStatus::Dead(tok)),
_ => {}
}
@@ -433,14 +423,15 @@ pub mod test {
#[test]
fn successful_parse_in_accepting_state_with_spans() {
// EchoState is placed into a Done state given AttrEnd.
let mut toks = [Token::AttrEnd(DS)].into_iter();
// EchoState is placed into a Done state given Comment.
let tok = Token::Comment("foo".into(), DS);
let mut toks = once(tok.clone());
let mut sut = Sut::from(&mut toks);
// The first token should be processed normally.
// EchoState proxies the token back.
assert_eq!(Some(Ok(Parsed::Object(Token::AttrEnd(DS)))), sut.next());
assert_eq!(Some(Ok(Parsed::Object(tok))), sut.next());
// This is now the end of the token stream,
// which should be okay provided that the first token put us into
@@ -494,11 +485,12 @@ pub mod test {
fn fails_when_parser_is_finalized_in_non_accepting_state() {
// Set up so that we have a single token that we can use for
// recovery as part of the same iterator.
let recovery = Token::Comment("recov".into(), DS);
let mut toks = [
// Used purely to populate a Span.
Token::Close(None, DS),
// Recovery token here:
Token::AttrEnd(DS),
recovery.clone(),
]
.into_iter();
@@ -524,7 +516,7 @@ pub mod test {
// `toks` above is set up already for this,
// which allows us to assert that we received back the same `sut`.
let mut sut = result.unwrap_err().0;
assert_eq!(Some(Ok(Parsed::Object(Token::AttrEnd(DS)))), sut.next());
assert_eq!(Some(Ok(Parsed::Object(recovery))), sut.next());
// And so we should now be in an accepting state,
// able to finalize.
@@ -533,8 +525,8 @@ pub mod test {
#[test]
fn unhandled_dead_state_results_in_error() {
// A comment will cause our parser to return Dead.
let tok = Token::Comment("dead".into(), DS);
// A Text will cause our parser to return Dead.
let tok = Token::Text("dead".into(), DS);
let mut toks = once(tok.clone());
let mut sut = Sut::from(&mut toks);

View File

@@ -201,11 +201,6 @@ fn empty_element_with_attrs_from_toks() {
assert_eq!(sut.next(), None);
}
// We should accommodate missing AttrEnd in an element context so that we
// can parse generated XIR without having to emit AttrEnd if we know it
// will not be necessary.
// I may come to regret that accommodation after we have to go back and add
// AttrEnd to systems that weren't providing it.
#[test]
fn child_element_after_attrs() {
let name = ("ns", "elem").unwrap_into();
@@ -217,7 +212,6 @@ fn child_element_after_attrs() {
Token::Open(name, *S),
Token::AttrName(attr, *S),
Token::AttrValue(val, *S2),
// No AttrEnd
Token::Open(child, *S),
Token::Close(None, *S2),
Token::Close(Some(name), *S3),
@@ -408,7 +402,6 @@ fn parser_attr_multiple() {
Token::AttrValue(val1, *S2),
Token::AttrName(attr2, *S2),
Token::AttrValue(val2, *S3),
Token::AttrEnd(*S3),
// Token that we should _not_ hit.
Token::Text("nohit".into(), *S),
]
@@ -418,13 +411,16 @@ fn parser_attr_multiple() {
assert_eq!(sut.next(), Some(Ok(Attr::new(attr1, val1, (*S, *S2)))));
assert_eq!(sut.next(), Some(Ok(Attr::new(attr2, val2, (*S2, *S3)))));
assert_eq!(sut.next(), None);
// Parsing must stop after AttrEnd,
// Parsing must stop after the last attribute,
// after which some other parser can continue on the same token
// stream.
// Even if there _were_ more attributes,
// this parser is spent and cannot continue.
drop(sut);
assert_eq!(toks.next(), Some(Token::Text("nohit".into(), *S)));
// stream
// (using this token as a lookahead).
assert_eq!(
sut.next(),
Some(Err(ParseError::UnexpectedToken(Token::Text(
"nohit".into(),
*S
))))
);
}

View File

@@ -255,9 +255,6 @@ impl<S: Escaper> XmlWriter<S> for Token {
Ok(W::AttrFragmentAdjacent)
}
// AttrEnd is ignored by the writer (and is optional).
(Self::AttrEnd(..), x) => Ok(x),
// TODO: We have no way of knowing if text should be formatted
// as CData,
// which may also be beneficial to avoid escaping if we
@@ -497,18 +494,6 @@ mod test {
Ok(())
}
// AttrEnd does not even need to be in a semantically valid position; we
// just ignore it entirely.
#[test]
fn ignores_attr_end() -> TestResult {
let result = Token::AttrEnd(*S)
.write_new(WriterState::NodeOpen, &MockEscaper::default())?;
assert_eq!(result.0, b"");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
#[test]
fn writes_text() -> TestResult {
let text = "test unescaped".intern();
@@ -573,7 +558,6 @@ mod test {
Token::Open(root, *S),
Token::AttrName(("an", "attr").try_into()?, *S),
Token::AttrValue("value".intern(), *S),
Token::AttrEnd(*S),
Token::Text("text".intern(), *S),
Token::Open(("c", "child").try_into()?, *S),
Token::Whitespace(" ".try_into()?, *S),