tamer: ir::xir::reader: Text and mixed content

It's nice being able to breeze through changes, since that's been a pretty
rare thing so far, given all the foundational work that has been needed.

This should get us pretty damn close to being able to parse the `xmlo` files
for the reader linker, if we're not there already.

DEV-10863
main
Mike Gerwitz 2021-10-21 21:42:39 -04:00
parent 13a779ec9c
commit 8b212959c8
2 changed files with 74 additions and 2 deletions

View File

@ -22,7 +22,7 @@
//! This uses [`quick_xml`] as the parser.
use super::{AttrValue, Error, Token};
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
use crate::{ir::xir::Text, span::DUMMY_SPAN, sym::GlobalSymbolInternBytes};
use quick_xml::{
self,
events::{attributes::Attributes, BytesStart, Event as QuickXmlEvent},
@ -120,6 +120,12 @@ impl<B: BufRead> XmlXirReader<B> {
self.refill_buf()
}
QuickXmlEvent::Text(bytes) => {
Some(bytes.intern_utf8().map_err(Error::from).and_then(
|text| Ok(Token::Text(Text::Escaped(text), DUMMY_SPAN)),
))
}
x => todo!("event: {:?}", x),
},
}

View File

@ -20,7 +20,7 @@
use super::*;
use crate::{
convert::ExpectInto,
ir::xir::{AttrValue, Token},
ir::xir::{AttrValue, Text, Token},
span::DUMMY_SPAN,
};
@ -227,6 +227,72 @@ fn child_node_with_attrs() {
);
}
// A text node that is the only child of an element is emitted as a single
// escaped text token between the element's open and close tokens.
#[test]
fn child_text() {
    let tokens = Sut::new(r#"<text>foo bar</text>"#.as_bytes())
        .collect::<Result<Vec<_>>>()
        .expect("parsing failed");

    let expected = vec![
        Token::Open("text".unwrap_into(), DUMMY_SPAN),
        Token::Text(Text::Escaped("foo bar".into()), DUMMY_SPAN),
        Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
    ];

    assert_eq!(tokens, expected);
}
// Mixed content: an element containing both text and a child element.
// The text preceding the child and the text inside the child are each
// expected as their own escaped text token, in document order.
#[test]
fn mixed_child_content() {
    let tokens = Sut::new(r#"<text>foo<em>bar</em></text>"#.as_bytes())
        .collect::<Result<Vec<_>>>()
        .expect("parsing failed");

    let expected = vec![
        // Outer element and its leading text.
        Token::Open("text".unwrap_into(), DUMMY_SPAN),
        Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN),
        // Nested element with its own text.
        Token::Open("em".unwrap_into(), DUMMY_SPAN),
        Token::Text(Text::Escaped("bar".into()), DUMMY_SPAN),
        Token::Close(Some("em".unwrap_into()), DUMMY_SPAN),
        Token::Close(Some("text".unwrap_into()), DUMMY_SPAN),
    ];

    assert_eq!(tokens, expected);
}
// This is how XML is typically written; people don't perceive it as mixed
// content, even though it is:  the whitespace between tags is text.
// The input intentionally places newlines before and after the opening and
// closing tags of the root node, so whitespace-only text tokens are
// expected outside of the root element as well as inside of it.
#[test]
fn mixed_child_content_with_newlines() {
    // The whitespace is spelled out with escapes rather than relying on
    // the layout of a multi-line raw string, so that the input always
    // agrees byte-for-byte with the text tokens expected below (in
    // particular the "\n " preceding `<child />`), regardless of how this
    // file is indented.
    let sut = Sut::new("\n<root>\n <child />\n</root>\n".as_bytes());

    let result = sut.collect::<Result<Vec<_>>>();

    assert_eq!(
        result.expect("parsing failed"),
        vec![
            // Newline preceding the root element.
            Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN),
            Token::Open("root".unwrap_into(), DUMMY_SPAN),
            // Newline and indentation preceding the child.
            Token::Text(Text::Escaped("\n ".into()), DUMMY_SPAN),
            // The self-closing child is an open followed by a nameless
            // close.
            Token::Open("child".unwrap_into(), DUMMY_SPAN),
            Token::Close(None, DUMMY_SPAN),
            // Newline preceding the root's closing tag.
            Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN),
            Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
            // Trailing newline following the root element.
            Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN),
        ],
    );
}
// TODO: Enough information for error recovery and reporting.
#[test]
fn node_name_invalid_utf8() {