tamer: ir::xir::reader: CData parsing

quick_xml provides us the value escaped, so we can just handle this the same way as Text for now. In the future, we may want to distinguish between the two so that we can reconstruct an identical XML document, but at the moment CData isn't used at all in TAME sources or outputs, and so I'm not going to worry about it for now. DEV-10863
2021-10-21 21:55:15 -04:00 · 2021-10-21 21:55:15 -04:00 · fdb8e5998c
parent 8b212959c8
commit fdb8e5998c
2 changed files with 42 additions and 1 deletions
--- a/tamer/src/ir/xir/reader.rs
+++ b/tamer/src/ir/xir/reader.rs
@ -120,7 +120,11 @@ impl<B: BufRead> XmlXirReader<B> {
                    self.refill_buf()
                }

-                QuickXmlEvent::Text(bytes) => {
+                // quick_xml gives us escaped bytes for CData,
+                //   so handle them identically.
+                // The question is whether we'll want to distinguish the two
+                //   in the future to reproduce the source document on write.
+                QuickXmlEvent::Text(bytes) | QuickXmlEvent::CData(bytes) => {
                    Some(bytes.intern_utf8().map_err(Error::from).and_then(
                        |text| Ok(Token::Text(Text::Escaped(text), DUMMY_SPAN)),
                    ))
--- a/tamer/src/ir/xir/reader/test.rs
+++ b/tamer/src/ir/xir/reader/test.rs
@ -293,6 +293,43 @@ fn mixed_child_content_with_newlines() {
    );
 }

+/// A CDATA section that is the sole child of an element is expected to
+///   be read as a single `Text::Escaped` token between open and close.
+#[test]
+fn child_cdata() {
+    let src = r#"<cd><![CDATA[<foo />]]></cd>"#;
+
+    let given = Sut::new(src.as_bytes()).collect::<Result<Vec<_>>>();
+    let expected = vec![
+        Token::Open("cd".unwrap_into(), DUMMY_SPAN),
+        // The CDATA content arrives already escaped from quick_xml.
+        Token::Text(Text::Escaped("&lt;foo /&gt;".into()), DUMMY_SPAN),
+        Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN),
+    ];
+
+    assert_eq!(given.expect("parsing failed"), expected);
+}
+
+/// CDATA following sibling text and a self-closing element is expected
+///   to be emitted as its own `Text::Escaped` token, in document order.
+#[test]
+fn mixed_child_text_and_cdata() {
+    let src = r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#;
+
+    let given = Sut::new(src.as_bytes()).collect::<Result<Vec<_>>>();
+    let expected = vec![
+        Token::Open("cd".unwrap_into(), DUMMY_SPAN),
+        Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN),
+        Token::Open("bar".unwrap_into(), DUMMY_SPAN),
+        Token::Close(None, DUMMY_SPAN),
+        // The CDATA content arrives already escaped from quick_xml.
+        Token::Text(Text::Escaped("&lt;baz/&gt;".into()), DUMMY_SPAN),
+        Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN),
+    ];
+
+    assert_eq!(given.expect("parsing failed"), expected);
+}
+
 // TODO: Enough information for error recovery and reporting.
 #[test]
 fn node_name_invalid_utf8() {