tamer: ir::xir::Token::AttrEnd: New token type

The purpose of this token is to implement a lazy streaming attribute
collection operation without a token of lookahead, which would complicate
parsing or require that a TokenStream provide a `peek` method.

This is only required for readers to produce, since readers will be feeding
data to parsers; I have the writer ignoring it.  If you're looking back at
this commit and wondering whether this was a bad idea: it does introduce
inconsistencies into the token stream depending on the context, which can
be confusing and error-prone.

The intent is to have the parser throw an explicit error if the new token is
missing in the context in which it is required, which will safely handle the
issue, but does defer it to runtime.  But only readers need auditing, and
there's only one XIR reader at the moment.
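
To sketch the intended consumption pattern (illustrative only, not part of
this commit; the Token and AttrParseError types below are simplified
stand-ins for the real XIR types), a parser can collect attributes by
consuming tokens until AttrEnd with no lookahead, and raise an explicit
error if the delimiter never arrives:

  // Simplified, hypothetical sketch; the real XIR tokens also carry
  // interned symbols and spans.
  #[derive(Debug, PartialEq)]
  enum Token {
      AttrName(String),
      AttrValue(String),
      AttrEnd,
      // Other variants (Open, Close, Text, ...) omitted for brevity.
  }

  #[derive(Debug, PartialEq)]
  enum AttrParseError {
      // The stream ended, or produced a non-attribute token, before
      // AttrEnd was seen.
      MissingAttrEnd,
      // An attribute name was not followed by a value.
      MissingAttrValue,
  }

  // Collect (name, value) pairs until AttrEnd is encountered.
  //
  // Because AttrEnd delimits the attribute list, no `peek` is needed to
  // decide when attributes end; a missing AttrEnd surfaces as an explicit
  // runtime error rather than a silent misparse.
  fn collect_attrs(
      toks: &mut impl Iterator<Item = Token>,
  ) -> Result<Vec<(String, String)>, AttrParseError> {
      let mut attrs = Vec::new();

      loop {
          match toks.next() {
              Some(Token::AttrEnd) => return Ok(attrs),
              Some(Token::AttrName(name)) => match toks.next() {
                  Some(Token::AttrValue(value)) => attrs.push((name, value)),
                  _ => return Err(AttrParseError::MissingAttrValue),
              },
              _ => return Err(AttrParseError::MissingAttrEnd),
          }
      }
  }

A reader that forgets to emit AttrEnd would then fail loudly at the first
element rather than produce a subtly wrong result.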

DEV-10863
branch: main
author: Mike Gerwitz
date: 2021-10-29 13:03:53 -04:00
parent: 18ab032ba0
commit: 7e6cb2c948
4 changed files with 58 additions and 0 deletions


@@ -549,6 +549,20 @@ pub enum Token {
/// components of generated attribute values.
AttrValueFragment(AttrValue, Span),
/// A delimiter indicating that attribute processing has ended and the
/// next token will be either a child node or [`Token::Close`].
///
/// This allows for streaming attribute collection without any
/// lookahead,
/// which would otherwise require an iterator supporting a `peek`
/// operation.
///
/// This is mandatory for _readers_ to produce,
/// but _writers must ignore it and not require it to be present_,
/// allowing for the reduction of token counts for generated XIR in
/// situations where we know that it will not be further parsed.
AttrEnd,
/// Comment node.
Comment(Text, Span),


@@ -205,6 +205,11 @@ impl<B: BufRead> XmlXirReader<B> {
tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
}
// Indicate the end of attributes even if no attributes were output.
// This allows for a reliable delimiter that can be used without
// lookahead for streaming attribute parsing.
tokbuf.push_front(Token::AttrEnd);
Ok(())
}
}


@@ -64,6 +64,7 @@ fn empty_node_without_prefix_or_attributes() {
result.expect("parsing failed"),
vec![
Token::Open("empty-node".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
);
@@ -86,6 +87,7 @@ fn does_not_resolve_xmlns() {
AttrValue::Escaped("noresolve".into()),
DUMMY_SPAN
),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
);
@@ -108,6 +110,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
AttrValue::Escaped("noresolve".into()),
DUMMY_SPAN
),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
);
@@ -146,6 +149,7 @@ fn multiple_attrs_ordered() {
Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN),
Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::Escaped("c".into()), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
);
@@ -167,6 +171,7 @@ fn permits_duplicate_attrs() {
Token::AttrValue(AttrValue::Escaped("a".into()), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::Escaped("b".into()), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
);
@@ -182,7 +187,9 @@ fn child_node_self_closing() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -199,9 +206,12 @@ fn sibling_nodes() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -218,9 +228,11 @@ fn child_node_with_attrs() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::Escaped("bar".into()), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -237,6 +249,7 @@ fn child_text() {
result.expect("parsing failed"),
vec![
Token::Open("text".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Text(Text::Escaped("foo bar".into()), DUMMY_SPAN),
Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
],
@@ -253,8 +266,10 @@ fn mixed_child_content() {
result.expect("parsing failed"),
vec![
Token::Open("text".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN),
Token::Open("em".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Text(Text::Escaped("bar".into()), DUMMY_SPAN),
Token::Close(Some("em".unwrap_into()), DUMMY_SPAN),
Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
@@ -283,8 +298,10 @@ fn mixed_child_content_with_newlines() {
vec![
Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN),
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Text(Text::Escaped("\n ".into()), DUMMY_SPAN),
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
@@ -303,6 +320,7 @@ fn child_cdata() {
result.expect("parsing failed"),
vec![
Token::Open("cd".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
// Escaped by quick_xml.
Token::Text(Text::Escaped("&lt;foo /&gt;".into()), DUMMY_SPAN),
Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN),
@@ -320,8 +338,10 @@ fn mixed_child_text_and_cdata() {
result.expect("parsing failed"),
vec![
Token::Open("cd".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN),
Token::Open("bar".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
// Escaped by quick_xml.
Token::Text(Text::Escaped("&lt;baz/&gt;".into()), DUMMY_SPAN),
@@ -341,6 +361,7 @@ fn comment() {
vec![
Token::Comment(Text::Unescaped("root".into()), DUMMY_SPAN),
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Comment(Text::Unescaped("<child>".into()), DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
],
@@ -363,6 +384,7 @@ lines-->
result.expect("parsing failed"),
vec![
Token::Open("mult".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Comment(
Text::Unescaped("comment\non multiple\nlines".into()),
DUMMY_SPAN
@@ -384,7 +406,9 @@ fn permits_mismatched_tags() {
result.expect("parsing failed"),
vec![
Token::Open("root".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("mismatch".unwrap_into()), DUMMY_SPAN),
],


@@ -247,6 +247,9 @@ impl XmlWriter for Token {
Ok(S::AttrFragmentAdjacent)
}
// AttrEnd is ignored by the writer (and is optional).
(Self::AttrEnd, x) => Ok(x),
// Unescaped not yet supported, but you could use CData.
(
Self::Text(Text::Escaped(text), _),
@@ -481,6 +484,17 @@ mod test {
Ok(())
}
// AttrEnd does not even need to be in a semantically valid position; we
// just ignore it entirely.
#[test]
fn ignores_attr_end() -> TestResult {
let result = Token::AttrEnd.write_new(WriterState::NodeOpen)?;
assert_eq!(result.0, b"");
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
}
#[test]
fn writes_escaped_text() -> TestResult {
// Just to be sure it's not trying to escape when we say it
@@ -563,6 +577,7 @@ mod test {
Token::Open(root, *S),
Token::AttrName(("an", "attr").try_into()?, *S),
Token::AttrValue(AttrValue::Escaped("value".intern()), *S),
Token::AttrEnd,
Token::Text(Text::Escaped("text".intern()), *S),
Token::Open(("c", "child").try_into()?, *S),
Token::Whitespace(" ".try_into()?, *S),