tamer: xir::reader: Opening and closing tag whitespace

Start tags that have no attributes and are not empty (self-closing) tags, as well as end tags, will have any whitespace following the element name included in the produced span, which extends through the closing `>`. This sets us up for a following change that will allow the name span to be derived from this span given a QName, which gives us a single span that both represents the entire XIR token and allows deriving the element name. An accurate token span is necessary for parsing errors where an element was not expected, while an element name span is more appropriate for grammatical and semantic errors that deal not with the fact that an element was encountered, but with _what_ element was encountered. DEV-7145
2022-06-22 15:10:49 -04:00 · 2022-06-22 15:10:49 -04:00 · 2fafc331a1
parent e5c8a218c3
commit 2fafc331a1
2 changed files with 115 additions and 26 deletions
--- a/tamer/src/xir/reader.rs
+++ b/tamer/src/xir/reader.rs
@ -165,9 +165,14 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
                )),

                QuickXmlEvent::End(ele) => Some({
-                    // </foo>
-                    // [----]  name + '<' + '/' + '>'
-                    let span = ctx.span_or_zz(prev_pos, ele.name().len() + 3);
+                    // Only whitespace is permitted following the element
+                    //   name,
+                    //     so we can simply take the delta of the buffer pos.
+                    //
+                    // </foo  >
+                    // [------]  name + '<' + '/' + "  >"
+                    let len = self.reader.buffer_position() - prev_pos;
+                    let span = ctx.span_or_zz(prev_pos, len);

                    ele.name()
                        .try_into()
@ -342,17 +347,21 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
            .try_into()
            .map_err(Error::from_with_span(ctx.span_or_zz(pos + 1, len)))
            .and_then(|qname| {
-                let has_attrs = ele.attributes_raw().len() > 0;
-                let noattr_add: usize = (!has_attrs && !empty_tag).into();
+                // The whitespace check is to handle input like this:
+                //   <foo />
+                //       ^ whitespace making `attributes_raw().len` > 0
+                let has_attrs = ele
+                    .attributes_raw()
+                    .iter()
+                    .find(|b| !Self::is_whitespace(**b))
+                    .is_some();

-                // <tag ... />               <tag/>
-                // [--]  name + '<'          [--] `noattr_add` must be 0
-                //
-                // <tag>...</tag>            <tag ...>...</tag>
-                // [---] name + '<' + '>'    [--] name + '<'
-                let span = ctx.span_or_zz(pos, len + 1 + noattr_add);
-
-                if has_attrs {
+                // The tail is anything following the last byte of the QName
+                //   in a non-empty tag with no attributes.
+                // For example:
+                //   <foo   >             <foo>          <foo bar="baz">
+                //       ~~~~ tail            ~ tail         (no tail)
+                let tail = if has_attrs {
                    let found = Self::parse_attrs(
                        escaper,
                        tokbuf,
@ -365,23 +374,33 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
                    // Given this input, quick-xml ignores the bytes entirely:
                    //   <foo bar>
                    //        [--] missing `="value"`
-                    //
-                    // The whitespace check is to handle input like this:
-                    //   <foo />
-                    //       ^ whitespace making `attributes_raw().len` > 0
-                    if !found
-                        && ele
-                            .attributes_raw()
-                            .iter()
-                            .find(|b| !Self::is_whitespace(**b))
-                            .is_some()
-                    {
+                    if !found {
                        return Err(Error::AttrValueExpected(
                            None,
                            ctx.span_or_zz(pos + ele.len() + 1, 0),
                        ));
                    }
-                }
+
+                    // No tail because of attributes.
+                    0
+                } else {
+                    match empty_tag {
+                        // Empty tag cannot have a tail.
+                        true => 0,
+                        // The "attributes" buffer represents whitespace,
+                        //   so the tail is the number of bytes of
+                        //   whitespace plus the closing '>' tag delimiter.
+                        false => ele.attributes_raw().len() + 1,
+                    }
+                };
+
+                // <tag ... />                   <tag/>
+                // [--] name + '<'               [--] name + '<'
+                //
+                // <tag  >...</tag>              <tag ...>...</tag>
+                // [-----] name + '<' + "  >"    [--] name + '<'
+                //     ~~~ tail
+                let span = ctx.span_or_zz(pos, len + 1 + tail);

                // The first token will be immediately returned
                //   via the Iterator.
@ -389,7 +408,13 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
            })
    }

-    /// quick-xml's whitespace predicate.
+    /// Whether the byte represents XML whitespace.
+    ///
+    /// This is quick-xml's whitespace predicate,
+    ///   and corresponds to the
+    ///     [nonterminal `S` in the XML specification][xmlspec-s].
+    ///
+    /// [xmlspec-s]: https://www.w3.org/TR/xml/#NT-S
    fn is_whitespace(b: u8) -> bool {
        match b {
            b' ' | b'\r' | b'\n' | b'\t' => true,
--- a/tamer/src/xir/reader/test.rs
+++ b/tamer/src/xir/reader/test.rs
@ -293,6 +293,70 @@ fn permits_duplicate_attrs() {
    );
 }

+#[test]
+fn open_close_no_child() {
+    new_sut!(sut = r#"<nochild></nochild>"#);
+    //                [-------][--------]
+    //                0       8`9      18
+    //                    A        B
+    //                  /
+    //    note that this includes '>' when there are no attrs,
+    //       since that results in a more intuitive span
+
+    let a = DC.span(0, 9);
+    let b = DC.span(9, 10);
+
+    assert_eq!(
+        Ok(vec![
+            O(Token::Open("nochild".unwrap_into(), a)),
+            O(Token::Close(Some("nochild".unwrap_into()), b)),
+        ]),
+        sut.collect(),
+    );
+}
+
+// Whitespace is permitted after opening tags
+//   (`STag` in the XML spec).
+#[test]
+fn open_close_no_child_open_tag_whitespace() {
+    new_sut!(sut = r#"<nochild   ></nochild>"#);
+    //                [----------][--------]
+    //                0         11`12     21
+    //                      A         B
+
+    let a = DC.span(0, 12);
+    let b = DC.span(12, 10);
+
+    assert_eq!(
+        Ok(vec![
+            O(Token::Open("nochild".unwrap_into(), a)),
+            O(Token::Close(Some("nochild".unwrap_into()), b)),
+        ]),
+        sut.collect(),
+    );
+}
+
+// Space after end tags is explicitly permitted by the XML spec
+//   (`ETag`).
+#[test]
+fn open_close_no_child_close_tag_whitespace() {
+    new_sut!(sut = r#"<nochild></nochild   >"#);
+    //                [-------][-----------]
+    //                0       8`9         21
+    //                    A          B
+
+    let a = DC.span(0, 9);
+    let b = DC.span(9, 13);
+
+    assert_eq!(
+        Ok(vec![
+            O(Token::Open("nochild".unwrap_into(), a)),
+            O(Token::Close(Some("nochild".unwrap_into()), b)),
+        ]),
+        sut.collect(),
+    );
+}
+
 #[test]
 fn child_node_self_closing() {
    new_sut!(sut = r#"<root><child /></root>"#);