tamer: xir::reader: Opening and closing tag whitespace

Start tags that have no attributes and are not self-closing (empty), as
well as end tags, will now include any trailing whitespace as part of the
produced span.  This sets us up for a following change that will allow for
deriving the name span from this span given a QName, which gives us a span
that both represents the entire XIR token and allows deriving the element
name.

An accurate token span is necessary for parsing errors where an element was
not expected, while an element name span is more appropriate for issues of
grammar and semantic errors that deal not with the fact that an element was
encountered, but _what_ element was encountered.

DEV-7145
main
Mike Gerwitz 2022-06-22 15:10:49 -04:00
parent e5c8a218c3
commit 2fafc331a1
2 changed files with 115 additions and 26 deletions

View File

@ -165,9 +165,14 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
)),
QuickXmlEvent::End(ele) => Some({
// </foo>
// [----] name + '<' + '/' + '>'
let span = ctx.span_or_zz(prev_pos, ele.name().len() + 3);
// Only whitespace is permitted following the element
// name,
// so we can simply take the delta of the buffer pos.
//
// </foo >
// [------] name + '<' + '/' + " >"
let len = self.reader.buffer_position() - prev_pos;
let span = ctx.span_or_zz(prev_pos, len);
ele.name()
.try_into()
@ -342,17 +347,21 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
.try_into()
.map_err(Error::from_with_span(ctx.span_or_zz(pos + 1, len)))
.and_then(|qname| {
let has_attrs = ele.attributes_raw().len() > 0;
let noattr_add: usize = (!has_attrs && !empty_tag).into();
// The whitespace check is to handle input like this:
// <foo />
// ^ whitespace making `attributes_raw().len` > 0
let has_attrs = ele
.attributes_raw()
.iter()
.find(|b| !Self::is_whitespace(**b))
.is_some();
// <tag ... /> <tag/>
// [--] name + '<' [--] `noattr_add` must be 0
//
// <tag>...</tag> <tag ...>...</tag>
// [---] name + '<' + '>' [--] name + '<'
let span = ctx.span_or_zz(pos, len + 1 + noattr_add);
if has_attrs {
// The tail is anything following the last byte of the QName
// in a non-empty tag with no attributes.
// For example:
// <foo > <foo> <foo bar="baz">
// ~~~~ tail ~ tail (no tail)
let tail = if has_attrs {
let found = Self::parse_attrs(
escaper,
tokbuf,
@ -365,23 +374,33 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
// Given this input, quick-xml ignores the bytes entirely:
// <foo bar>
// [--] missing `="value"`
//
// The whitespace check is to handle input like this:
// <foo />
// ^ whitespace making `attributes_raw().len` > 0
if !found
&& ele
.attributes_raw()
.iter()
.find(|b| !Self::is_whitespace(**b))
.is_some()
{
if !found {
return Err(Error::AttrValueExpected(
None,
ctx.span_or_zz(pos + ele.len() + 1, 0),
));
}
}
// No tail because of attributes.
0
} else {
match empty_tag {
// Empty tag cannot have a tail.
true => 0,
// The "attributes" buffer represents whitespace,
// so the tail is the number of bytes of
// whitespace plus the closing '>' tag delimiter.
false => ele.attributes_raw().len() + 1,
}
};
// <tag ... /> <tag/>
// [--] name + '<' [--] name + '<'
//
// <tag >...</tag> <tag ...>...</tag>
// [-----] name + '<' + " >" [--] name + '<'
// ~~~ tail
let span = ctx.span_or_zz(pos, len + 1 + tail);
// The first token will be immediately returned
// via the Iterator.
@ -389,7 +408,13 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
})
}
/// quick-xml's whitespace predicate.
/// Whether the byte represents XML whitespace.
///
/// This is quick-xml's whitespace predicate,
/// and corresponds to the
/// [nonterminal `S` in the XML specification][xmlspec-s].
///
/// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
fn is_whitespace(b: u8) -> bool {
match b {
b' ' | b'\r' | b'\n' | b'\t' => true,

View File

@ -293,6 +293,70 @@ fn permits_duplicate_attrs() {
);
}
#[test]
fn open_close_no_child() {
    new_sut!(sut = r#"<nochild></nochild>"#);
    //                [-------][--------]
    //                0       8`9      18
    //                    A        B
    //
    // Span A covers the closing '>' of the open tag when there are no
    //   attrs,
    //   since that results in a more intuitive span.
    let open_span = DC.span(0, 9);
    let close_span = DC.span(9, 10);

    // An open token immediately followed by its matching close token.
    let expected = Ok(vec![
        O(Token::Open("nochild".unwrap_into(), open_span)),
        O(Token::Close(Some("nochild".unwrap_into()), close_span)),
    ]);

    assert_eq!(expected, sut.collect());
}
// Whitespace is permitted after the element name in opening tags
//   (`STag` in the XML spec).
// The span of the open token is expected to include that whitespace along
//   with the closing '>'.
#[test]
fn open_close_no_child_open_tag_whitespace() {
    // Three bytes of whitespace precede '>' in the open tag;
    //   the byte offsets asserted below depend on that exact count.
    new_sut!(sut = r#"<nochild   ></nochild>"#);
    //                [----------][--------]
    //                0         11`12     21
    //                     A          B
    let a = DC.span(0, 12); // "<nochild" (8) + "   " (3) + ">" (1)
    let b = DC.span(12, 10); // "</nochild>" (10)

    assert_eq!(
        Ok(vec![
            O(Token::Open("nochild".unwrap_into(), a)),
            O(Token::Close(Some("nochild".unwrap_into()), b)),
        ]),
        sut.collect(),
    );
}
// Space after the element name in end tags is explicitly permitted by the
//   XML spec
//   (`ETag`).
// The span of the close token is expected to include that whitespace along
//   with the closing '>'.
#[test]
fn open_close_no_child_close_tag_whitespace() {
    // Three bytes of whitespace precede '>' in the close tag;
    //   the byte offsets asserted below depend on that exact count.
    new_sut!(sut = r#"<nochild></nochild   >"#);
    //                [-------][-----------]
    //                0       8`9         21
    //                    A          B
    let a = DC.span(0, 9); // "<nochild>" (9)
    let b = DC.span(9, 13); // "</nochild" (9) + "   " (3) + ">" (1)

    assert_eq!(
        Ok(vec![
            O(Token::Open("nochild".unwrap_into(), a)),
            O(Token::Close(Some("nochild".unwrap_into()), b)),
        ]),
        sut.collect(),
    );
}
#[test]
fn child_node_self_closing() {
new_sut!(sut = r#"<root><child /></root>"#);