tamer: ir::xir::reader: CData parsing

quick_xml provides us the value escaped, so we can just handle this the same
way as Text for now.

In the future, we may want to distinguish between the two so that we can
reconstruct an identical XML document, but at the moment CData isn't used at
all in TAME sources or outputs, and so I'm not going to worry about it for
now.

DEV-10863
main
Mike Gerwitz 2021-10-21 21:55:15 -04:00
parent 8b212959c8
commit fdb8e5998c
2 changed files with 42 additions and 1 deletions

View File

@ -120,7 +120,11 @@ impl<B: BufRead> XmlXirReader<B> {
self.refill_buf()
}
QuickXmlEvent::Text(bytes) => {
// quick_xml gives us escaped bytes for CData,
// so handle them identically.
// The question is whether we'll want to distinguish the two
// in the future to reproduce the source document on write.
QuickXmlEvent::Text(bytes) | QuickXmlEvent::CData(bytes) => {
Some(bytes.intern_utf8().map_err(Error::from).and_then(
|text| Ok(Token::Text(Text::Escaped(text), DUMMY_SPAN)),
))

View File

@ -293,6 +293,43 @@ fn mixed_child_content_with_newlines() {
);
}
// A CDATA section that is the only child of an element is expected to come
// out of the reader as a single escaped `Text` token sitting between the
// element's open and close tokens.
#[test]
fn child_cdata() {
    let src = r#"<cd><![CDATA[<foo />]]></cd>"#;

    let tokens = Sut::new(src.as_bytes())
        .collect::<Result<Vec<_>>>()
        .expect("parsing failed");

    let expected = vec![
        Token::Open("cd".unwrap_into(), DUMMY_SPAN),
        // quick_xml hands the CDATA content back already escaped.
        Token::Text(Text::Escaped("&lt;foo /&gt;".into()), DUMMY_SPAN),
        Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN),
    ];

    assert_eq!(tokens, expected);
}
// Plain text, an empty child element, and a CDATA section appearing as
// siblings are expected to be emitted in document order,
// with the CDATA content represented as an escaped `Text` token just like
// the ordinary text node.
#[test]
fn mixed_child_text_and_cdata() {
    let src = r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#;

    let tokens = Sut::new(src.as_bytes())
        .collect::<Result<Vec<_>>>()
        .expect("parsing failed");

    let expected = vec![
        Token::Open("cd".unwrap_into(), DUMMY_SPAN),
        Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN),
        Token::Open("bar".unwrap_into(), DUMMY_SPAN),
        Token::Close(None, DUMMY_SPAN),
        // quick_xml hands the CDATA content back already escaped.
        Token::Text(Text::Escaped("&lt;baz/&gt;".into()), DUMMY_SPAN),
        Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN),
    ];

    assert_eq!(tokens, expected);
}
// TODO: Enough information for error recovery and reporting.
#[test]
fn node_name_invalid_utf8() {