From 7e6cb2c94825f21fecabd65de10952766d005253 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Fri, 29 Oct 2021 13:03:53 -0400 Subject: [PATCH] tamer: ir::xir::Token::AttrEnd: New token type The purpose of this token is to implement a lazy streaming attribute collection operation without a token of lookup, which would complicate parsing or require that a TokenStream provide a `peek` method. This is only required for readers to produce, since readers will be feeding data to parsers. I have the writer ignoring it. If you're looking back at this commit, the question is whether this was a bad idea: it introduces inconsistencies into the token stream depending on the context, which can be confusing and error-prone. The intent is to have the parser throw an explicit error if the new token is missing in the context in which it is required, which will safely handle the issue, but does defer it to runtime. But only readers need auditing, and there's only one XIR reader at the moment. DEV-10863 --- tamer/src/ir/xir.rs | 14 ++++++++++++++ tamer/src/ir/xir/reader.rs | 5 +++++ tamer/src/ir/xir/reader/test.rs | 24 ++++++++++++++++++++++++ tamer/src/ir/xir/writer.rs | 15 +++++++++++++++ 4 files changed, 58 insertions(+) diff --git a/tamer/src/ir/xir.rs b/tamer/src/ir/xir.rs index ffd73661..0dc42050 100644 --- a/tamer/src/ir/xir.rs +++ b/tamer/src/ir/xir.rs @@ -549,6 +549,20 @@ pub enum Token { /// components of generated attribute values. AttrValueFragment(AttrValue, Span), + /// A delimiter indicating that attribute processing has ended and the + /// next token will be either a child node or [`Token::Close`]. + /// + /// This allows for streaming attribute collection without any + /// lookahead, + /// which would otherwise require an iterator supporting a `peek` + /// operation. + /// + /// This is mandatory for _readers_ to produce, + /// but _writers must ignore it and not require it to be present_, + /// allowing for the reduction of token counts for generated XIR in + /// situations where we know that it will not be further parsed. + AttrEnd, + /// Comment node. Comment(Text, Span), diff --git a/tamer/src/ir/xir/reader.rs b/tamer/src/ir/xir/reader.rs index 30fd821c..2525265d 100644 --- a/tamer/src/ir/xir/reader.rs +++ b/tamer/src/ir/xir/reader.rs @@ -205,6 +205,11 @@ impl XmlXirReader { tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN)); } + // Indicate the end of attributes even if no attributes were output. + // This allows for a reliable delimiter that can be used without + // lookahead for streaming attribute parsing. + tokbuf.push_front(Token::AttrEnd); + Ok(()) } } diff --git a/tamer/src/ir/xir/reader/test.rs b/tamer/src/ir/xir/reader/test.rs index 2a8b2d4d..50228749 100644 --- a/tamer/src/ir/xir/reader/test.rs +++ b/tamer/src/ir/xir/reader/test.rs @@ -64,6 +64,7 @@ fn empty_node_without_prefix_or_attributes() { result.expect("parsing failed"), vec![ Token::Open("empty-node".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -86,6 +87,7 @@ fn does_not_resolve_xmlns() { AttrValue::Escaped("noresolve".into()), DUMMY_SPAN ), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -108,6 +110,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() { AttrValue::Escaped("noresolve".into()), DUMMY_SPAN ), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -146,6 +149,7 @@ fn multiple_attrs_ordered() { Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN), Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("c".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -167,6 +171,7 @@ fn permits_duplicate_attrs() { Token::AttrValue(AttrValue::Escaped("a".into()), DUMMY_SPAN), Token::AttrName("attr".unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -182,7 +187,9 @@ fn child_node_self_closing() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -199,9 +206,12 @@ fn sibling_nodes() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -218,9 +228,11 @@ fn child_node_with_attrs() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), Token::AttrName("foo".unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("bar".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -237,6 +249,7 @@ fn child_text() { result.expect("parsing failed"), vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo bar".into()), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), ], @@ -253,8 +266,10 @@ fn mixed_child_content() { result.expect("parsing failed"), vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), Token::Open("em".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("bar".into()), DUMMY_SPAN), Token::Close(Some("em".unwrap_into()), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), @@ -283,8 +298,10 @@ fn mixed_child_content_with_newlines() { vec![ Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("\n ".into()), DUMMY_SPAN), Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), @@ -303,6 +320,7 @@ fn child_cdata() { result.expect("parsing failed"), vec![ Token::Open("cd".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, // Escaped by quick_xml. Token::Text(Text::Escaped("<foo />".into()), DUMMY_SPAN), Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN), @@ -320,8 +338,10 @@ fn mixed_child_text_and_cdata() { result.expect("parsing failed"), vec![ Token::Open("cd".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), Token::Open("bar".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), // Escaped by quick_xml. Token::Text(Text::Escaped("<baz/>".into()), DUMMY_SPAN), @@ -341,6 +361,7 @@ fn comment() { vec![ Token::Comment(Text::Unescaped("root".into()), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Comment(Text::Unescaped("".into()), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -363,6 +384,7 @@ lines--> result.expect("parsing failed"), vec![ Token::Open("mult".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Comment( Text::Unescaped("comment\non multiple\nlines".into()), DUMMY_SPAN @@ -384,7 +406,9 @@ fn permits_mismatched_tags() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("mismatch".unwrap_into()), DUMMY_SPAN), ], diff --git a/tamer/src/ir/xir/writer.rs b/tamer/src/ir/xir/writer.rs index 9bf8e020..6b1475dc 100644 --- a/tamer/src/ir/xir/writer.rs +++ b/tamer/src/ir/xir/writer.rs @@ -247,6 +247,9 @@ impl XmlWriter for Token { Ok(S::AttrFragmentAdjacent) } + // AttrEnd is ignored by the writer (and is optional). + (Self::AttrEnd, x) => Ok(x), + // Unescaped not yet supported, but you could use CData. ( Self::Text(Text::Escaped(text), _), @@ -481,6 +484,17 @@ mod test { Ok(()) } + // AttrEnd does not even need to be in a semantically valid position; we + // just ignore it entirely. + #[test] + fn ignores_attr_end() -> TestResult { + let result = Token::AttrEnd.write_new(WriterState::NodeOpen)?; + assert_eq!(result.0, b""); + assert_eq!(result.1, WriterState::NodeOpen); + + Ok(()) + } + #[test] fn writes_escaped_text() -> TestResult { // Just to be sure it's not trying to escape when we say it @@ -563,6 +577,7 @@ mod test { Token::Open(root, *S), Token::AttrName(("an", "attr").try_into()?, *S), Token::AttrValue(AttrValue::Escaped("value".intern()), *S), + Token::AttrEnd, Token::Text(Text::Escaped("text".intern()), *S), Token::Open(("c", "child").try_into()?, *S), Token::Whitespace(" ".try_into()?, *S),