diff --git a/tamer/src/ir/xir.rs b/tamer/src/ir/xir.rs index ffd73661..0dc42050 100644 --- a/tamer/src/ir/xir.rs +++ b/tamer/src/ir/xir.rs @@ -549,6 +549,20 @@ pub enum Token { /// components of generated attribute values. AttrValueFragment(AttrValue, Span), + /// A delimiter indicating that attribute processing has ended and the + /// next token will be either a child node or [`Token::Close`]. + /// + /// This allows for streaming attribute collection without any + /// lookahead, + /// which would otherwise require an iterator supporting a `peek` + /// operation. + /// + /// This is mandatory for _readers_ to produce, + /// but _writers must ignore it and not require it to be present_, + /// allowing for the reduction of token counts for generated XIR in + /// situations where we know that it will not be further parsed. + AttrEnd, + /// Comment node. Comment(Text, Span), diff --git a/tamer/src/ir/xir/reader.rs b/tamer/src/ir/xir/reader.rs index 30fd821c..2525265d 100644 --- a/tamer/src/ir/xir/reader.rs +++ b/tamer/src/ir/xir/reader.rs @@ -205,6 +205,11 @@ impl XmlXirReader { tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN)); } + // Indicate the end of attributes even if no attributes were output. + // This allows for a reliable delimiter that can be used without + // lookahead for streaming attribute parsing. + tokbuf.push_front(Token::AttrEnd); + Ok(()) } } diff --git a/tamer/src/ir/xir/reader/test.rs b/tamer/src/ir/xir/reader/test.rs index 2a8b2d4d..50228749 100644 --- a/tamer/src/ir/xir/reader/test.rs +++ b/tamer/src/ir/xir/reader/test.rs @@ -64,6 +64,7 @@ fn empty_node_without_prefix_or_attributes() { result.expect("parsing failed"), vec![ Token::Open("empty-node".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -86,6 +87,7 @@ fn does_not_resolve_xmlns() { AttrValue::Escaped("noresolve".into()), DUMMY_SPAN ), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -108,6 +110,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() { AttrValue::Escaped("noresolve".into()), DUMMY_SPAN ), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -146,6 +149,7 @@ fn multiple_attrs_ordered() { Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN), Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("c".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -167,6 +171,7 @@ fn permits_duplicate_attrs() { Token::AttrValue(AttrValue::Escaped("a".into()), DUMMY_SPAN), Token::AttrName("attr".unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), ], ); @@ -182,7 +187,9 @@ fn child_node_self_closing() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -199,9 +206,12 @@ fn sibling_nodes() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -218,9 +228,11 @@ fn child_node_with_attrs() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), Token::AttrName("foo".unwrap_into(), DUMMY_SPAN), Token::AttrValue(AttrValue::Escaped("bar".into()), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -237,6 +249,7 @@ fn child_text() { result.expect("parsing failed"), vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo bar".into()), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), ], @@ -253,8 +266,10 @@ fn mixed_child_content() { result.expect("parsing failed"), vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), Token::Open("em".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("bar".into()), DUMMY_SPAN), Token::Close(Some("em".unwrap_into()), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), @@ -283,8 +298,10 @@ fn mixed_child_content_with_newlines() { vec![ Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("\n ".into()), DUMMY_SPAN), Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), @@ -303,6 +320,7 @@ fn child_cdata() { result.expect("parsing failed"), vec![ Token::Open("cd".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, // Escaped by quick_xml. Token::Text(Text::Escaped("<foo />".into()), DUMMY_SPAN), Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN), @@ -320,8 +338,10 @@ fn mixed_child_text_and_cdata() { result.expect("parsing failed"), vec![ Token::Open("cd".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), Token::Open("bar".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), // Escaped by quick_xml. Token::Text(Text::Escaped("<baz/>".into()), DUMMY_SPAN), @@ -341,6 +361,7 @@ fn comment() { vec![ Token::Comment(Text::Unescaped("root".into()), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Comment(Text::Unescaped("".into()), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], @@ -363,6 +384,7 @@ lines--> result.expect("parsing failed"), vec![ Token::Open("mult".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Comment( Text::Unescaped("comment\non multiple\nlines".into()), DUMMY_SPAN @@ -384,7 +406,9 @@ fn permits_mismatched_tags() { result.expect("parsing failed"), vec![ Token::Open("root".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Open("child".unwrap_into(), DUMMY_SPAN), + Token::AttrEnd, Token::Close(None, DUMMY_SPAN), Token::Close(Some("mismatch".unwrap_into()), DUMMY_SPAN), ], diff --git a/tamer/src/ir/xir/writer.rs b/tamer/src/ir/xir/writer.rs index 9bf8e020..6b1475dc 100644 --- a/tamer/src/ir/xir/writer.rs +++ b/tamer/src/ir/xir/writer.rs @@ -247,6 +247,9 @@ impl XmlWriter for Token { Ok(S::AttrFragmentAdjacent) } + // AttrEnd is ignored by the writer (and is optional). + (Self::AttrEnd, x) => Ok(x), + // Unescaped not yet supported, but you could use CData. ( Self::Text(Text::Escaped(text), _), @@ -481,6 +484,17 @@ mod test { Ok(()) } + // AttrEnd does not even need to be in a semantically valid position; we + // just ignore it entirely. + #[test] + fn ignores_attr_end() -> TestResult { + let result = Token::AttrEnd.write_new(WriterState::NodeOpen)?; + assert_eq!(result.0, b""); + assert_eq!(result.1, WriterState::NodeOpen); + + Ok(()) + } + #[test] fn writes_escaped_text() -> TestResult { // Just to be sure it's not trying to escape when we say it @@ -563,6 +577,7 @@ mod test { Token::Open(root, *S), Token::AttrName(("an", "attr").try_into()?, *S), Token::AttrValue(AttrValue::Escaped("value".intern()), *S), + Token::AttrEnd, Token::Text(Text::Escaped("text".intern()), *S), Token::Open(("c", "child").try_into()?, *S), Token::Whitespace(" ".try_into()?, *S),