From 52338223226cbb61e15e09e7d3a044473a148556 Mon Sep 17 00:00:00 2001 From: Mike Gerwitz Date: Mon, 15 Nov 2021 23:47:14 -0500 Subject: [PATCH] tamer: xir: Remove Text enum Like previous commits, this replaces the explicit escaping context with the convention that all values retrieved from `xir` are unescaped on read and escaped on write. Comments are a notable TODO, since we must escape only `--`. CData is also an issue. I had _expected_ to use it as a means to avoid unescaping fragments, but I had forgotten that quick_xml hard-codes escaping on read, so that it can re-use BytesStart! That is terribly unfortunate, and may result in us having to re-implement our own read method in the future to avoid this nonsense. So I'm just leaving it as a TODO for now. DEV-11081 --- tamer/benches/xir.rs | 23 ++++--- tamer/src/ld/xmle/xir.rs | 4 +- tamer/src/ld/xmle/xir/test.rs | 4 +- tamer/src/xir.rs | 49 ++------------- tamer/src/xir/reader.rs | 31 +++++---- tamer/src/xir/reader/test.rs | 67 ++++---------------- tamer/src/xir/tree.rs | 27 ++++---- tamer/src/xir/tree/test.rs | 11 ++-- tamer/src/xir/writer.rs | 115 +++++++++++----------------------- 9 files changed, 107 insertions(+), 224 deletions(-) diff --git a/tamer/benches/xir.rs b/tamer/benches/xir.rs index fee3489d..c11a774c 100644 --- a/tamer/benches/xir.rs +++ b/tamer/benches/xir.rs @@ -138,7 +138,7 @@ mod writer { Writer as QuickXmlWriter, }; use std::borrow::Cow; - use tamer::xir::{writer::XmlWriter, Text}; + use tamer::xir::{writer::XmlWriter, Escaper}; use tamer::{span::Span, xir::DefaultEscaper}; const FRAGMENT: &str = r#" @@ -205,6 +205,12 @@ This is pretend fragment text. We need a lot of it. let val1 = "value".intern(); let val2 = "value2".intern(); + // Prime the cache, since BytesStart is already assumed to be + // escaped. We will have cached on read in a real-world scenario. + let escaper = DefaultEscaper::default(); + escaper.escape(val1); + escaper.escape(val2); + bench.iter(|| { (0..1000).for_each(|_| { vec![ @@ -216,7 +222,7 @@ This is pretend fragment text. We need a lot of it. Token::Close(None, span), ] .into_iter() - .write(&mut buf, Default::default(), &DefaultEscaper::default()) + .write(&mut buf, Default::default(), &escaper) .unwrap(); }); }); @@ -250,14 +256,15 @@ This is pretend fragment text. We need a lot of it. let frag: SymbolId = FRAGMENT.intern(); let span = Span::from_byte_interval((0, 0), "path".intern()); + // Prime the cache, since BytesStart is already assumed to be + // escaped. + let escaper = DefaultEscaper::default(); + escaper.escape(frag); + bench.iter(|| { (0..50).for_each(|_| { - Token::Text(Text::Escaped(frag), span) - .write( - &mut buf, - Default::default(), - &DefaultEscaper::default(), - ) + Token::Text(frag, span) + .write(&mut buf, Default::default(), &escaper) .unwrap(); }); }); diff --git a/tamer/src/ld/xmle/xir.rs b/tamer/src/ld/xmle/xir.rs index 23ff31af..c0f4472b 100644 --- a/tamer/src/ld/xmle/xir.rs +++ b/tamer/src/ld/xmle/xir.rs @@ -35,7 +35,7 @@ use crate::{ sym::{st::*, SymbolId}, xir::{ iter::{elem_wrap, ElemWrapIter}, - QName, Text, Token, + QName, Token, }, }; use arrayvec::ArrayVec; @@ -306,7 +306,7 @@ impl Iterator for FragmentIter { fn next(&mut self) -> Option { self.iter .by_ref() - .map(|frag| Token::Text(Text::Escaped(frag), LSPAN)) + .map(|frag| Token::Text(frag, LSPAN)) .next() } } diff --git a/tamer/src/ld/xmle/xir/test.rs b/tamer/src/ld/xmle/xir/test.rs index bc4ad2aa..16cf7a26 100644 --- a/tamer/src/ld/xmle/xir/test.rs +++ b/tamer/src/ld/xmle/xir/test.rs @@ -490,8 +490,8 @@ macro_rules! test_exec_sec { // Order _absolutely_ matters, // since the purpose of the linker is to put things into the correct // order for execution. - assert_eq!(nodes[0].as_text(), Some(&Text::Escaped(frag_a))); - assert_eq!(nodes[1].as_text(), Some(&Text::Escaped(frag_b))); + assert_eq!(Some(frag_a), nodes[0].as_sym()); + assert_eq!(Some(frag_b), nodes[1].as_sym()); assert_eq!(nodes.len(), 2); diff --git a/tamer/src/xir.rs b/tamer/src/xir.rs index 07d2dea4..93b5a0ae 100644 --- a/tamer/src/xir.rs +++ b/tamer/src/xir.rs @@ -297,10 +297,9 @@ impl TryFrom<&str> for Whitespace { } } -impl From for Text { +impl From for SymbolId { fn from(ws: Whitespace) -> Self { - // Whitespace needs no escaping - Self::Escaped(ws.0) + ws.0 } } @@ -450,37 +449,6 @@ impl Display for QName { } } -/// Represents text and its escaped state. -/// -/// Being explicit about the state of escaping allows us to skip checks when -/// we know that the generated text could not possibly require escaping. -/// This does, however, put the onus on the caller to ensure that they got -/// the escaping status correct. -/// (TODO: More information on why this burden isn"t all that bad, -/// despite the risk.) -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Text { - /// Text node that requires escaping. - /// - /// Unescaped text requires further processing before writing. - /// - /// Note that, - /// since the unescaped text is interned, - /// it may be wasteful to intern a large text node with the intent of - /// escaping and re-interning later. - /// Instead, - /// if escaping is only needed for writing, - /// it is likely better to leave it to the writer to escape, - /// which does _not_ require interning of the resulting string. - Unescaped(SymbolId), - - /// Text node that either has already been escaped or is known not to - /// require escaping. - /// - /// Escaped text can be written as-is without any further processing. - Escaped(SymbolId), -} - /// Lightly-structured XML tokens with associated [`Span`]s. /// /// This is a streamable IR for XML. @@ -549,23 +517,21 @@ pub enum Token { AttrEnd, /// Comment node. - Comment(Text, Span), + Comment(SymbolId, Span), /// Character data as part of an element. /// /// See also [`CData`](Token::CData) variant. - Text(Text, Span), + Text(SymbolId, Span), /// CData node (``). /// - /// See also [`Text`](Token::Text) variant. - /// /// _Warning: It is up to the caller to ensure that the string `]]>` is /// not present in the text!_ /// This is intended for reading existing XML data where CData is /// already present, /// not for producing new CData safely! - CData(Text, Span), + CData(SymbolId, Span), /// Similar to `Text`, /// but intended for use where only whitespace is allowed, @@ -704,10 +670,7 @@ mod test { #[test] fn whitespace_as_text() -> TestResult { - assert_eq!( - Text::Escaped(" ".intern()), - Whitespace::try_from(" ")?.into(), - ); + assert_eq!(" ".intern(), Whitespace::try_from(" ")?.into(),); Ok(()) } diff --git a/tamer/src/xir/reader.rs b/tamer/src/xir/reader.rs index 2026963b..f8821bdf 100644 --- a/tamer/src/xir/reader.rs +++ b/tamer/src/xir/reader.rs @@ -22,7 +22,7 @@ //! This uses [`quick_xml`] as the parser. use super::{DefaultEscaper, Error, Escaper, Token}; -use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes, xir::Text}; +use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes}; use quick_xml::{ self, events::{attributes::Attributes, BytesStart, Event as QuickXmlEvent}, @@ -140,21 +140,26 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> { self.refill_buf() } - // quick_xml gives us escaped bytes for CData, - // so handle them identically. - // The question is whether we'll want to distinguish the two - // in the future to reproduce the source document on write. - QuickXmlEvent::Text(bytes) | QuickXmlEvent::CData(bytes) => { - Some(bytes.intern_utf8().map_err(Error::from).and_then( - |text| Ok(Token::Text(Text::Escaped(text), DUMMY_SPAN)), - )) - } + // quick_xml _escapes_ the unescaped CData before handing it + // off to us, + // which is a complete waste since we'd just have to + // unescape it again. + QuickXmlEvent::CData(bytes) => todo!("CData: {:?}", bytes), + + QuickXmlEvent::Text(bytes) => Some( + bytes + .intern_utf8() + .map_err(Error::from) + .and_then(|sym| self.escaper.unescape(sym)) + .map(|unesc| Token::Text(unesc, DUMMY_SPAN)), + ), // Comments are _not_ returned escaped. QuickXmlEvent::Comment(bytes) => Some( - bytes.intern_utf8().map_err(Error::from).and_then(|text| { - Ok(Token::Comment(Text::Unescaped(text), DUMMY_SPAN)) - }), + bytes + .intern_utf8() + .map_err(Error::from) + .map(|text| Token::Comment(text, DUMMY_SPAN)), ), x => todo!("event: {:?}", x), diff --git a/tamer/src/xir/reader/test.rs b/tamer/src/xir/reader/test.rs index ebeab201..93100d86 100644 --- a/tamer/src/xir/reader/test.rs +++ b/tamer/src/xir/reader/test.rs @@ -24,7 +24,7 @@ use crate::sym::GlobalSymbolIntern; use crate::{ convert::ExpectInto, span::DUMMY_SPAN, - xir::{Error, Text, Token}, + xir::{Error, Token}, }; /// These tests use [`quick_xml`] directly, @@ -275,7 +275,7 @@ fn child_text() { vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Text(Text::Escaped("foo bar".into()), DUMMY_SPAN), + Token::Text("foo bar:UNESC".into(), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), ], ); @@ -292,10 +292,10 @@ fn mixed_child_content() { vec![ Token::Open("text".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), + Token::Text("foo:UNESC".into(), DUMMY_SPAN), Token::Open("em".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Text(Text::Escaped("bar".into()), DUMMY_SPAN), + Token::Text("bar:UNESC".into(), DUMMY_SPAN), Token::Close(Some("em".unwrap_into()), DUMMY_SPAN), Token::Close(Some("text".unwrap_into()), DUMMY_SPAN), ], @@ -320,56 +320,16 @@ fn mixed_child_content_with_newlines() { assert_eq!( result.expect("parsing failed"), vec![ - Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), + Token::Text("\n:UNESC".into(), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Text(Text::Escaped("\n ".into()), DUMMY_SPAN), + Token::Text("\n :UNESC".into(), DUMMY_SPAN), Token::Open("child".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, Token::Close(None, DUMMY_SPAN), - Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), + Token::Text("\n:UNESC".into(), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), - Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), - ], - ); -} - -#[test] -fn child_cdata() { - new_sut!(sut = r#"]]>"#); - - let result = sut.collect::>>(); - - assert_eq!( - result.expect("parsing failed"), - vec![ - Token::Open("cd".unwrap_into(), DUMMY_SPAN), - Token::AttrEnd, - // Escaped by quick_xml. - Token::Text(Text::Escaped("<foo />".into()), DUMMY_SPAN), - Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN), - ], - ); -} - -#[test] -fn mixed_child_text_and_cdata() { - new_sut!(sut = r#"foo]]>"#); - - let result = sut.collect::>>(); - - assert_eq!( - result.expect("parsing failed"), - vec![ - Token::Open("cd".unwrap_into(), DUMMY_SPAN), - Token::AttrEnd, - Token::Text(Text::Escaped("foo".into()), DUMMY_SPAN), - Token::Open("bar".unwrap_into(), DUMMY_SPAN), - Token::AttrEnd, - Token::Close(None, DUMMY_SPAN), - // Escaped by quick_xml. - Token::Text(Text::Escaped("<baz/>".into()), DUMMY_SPAN), - Token::Close(Some("cd".unwrap_into()), DUMMY_SPAN), + Token::Text("\n:UNESC".into(), DUMMY_SPAN), ], ); } @@ -383,10 +343,10 @@ fn comment() { assert_eq!( result.expect("parsing failed"), vec![ - Token::Comment(Text::Unescaped("root".into()), DUMMY_SPAN), + Token::Comment("root".into(), DUMMY_SPAN), Token::Open("root".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Comment(Text::Unescaped("".into()), DUMMY_SPAN), + Token::Comment("".into(), DUMMY_SPAN), Token::Close(Some("root".unwrap_into()), DUMMY_SPAN), ], ); @@ -408,11 +368,8 @@ lines--> vec![ Token::Open("mult".unwrap_into(), DUMMY_SPAN), Token::AttrEnd, - Token::Comment( - Text::Unescaped("comment\non multiple\nlines".into()), - DUMMY_SPAN - ), - Token::Text(Text::Escaped("\n".into()), DUMMY_SPAN), + Token::Comment("comment\non multiple\nlines".into(), DUMMY_SPAN), + Token::Text("\n:UNESC".into(), DUMMY_SPAN), Token::Close(Some("mult".unwrap_into()), DUMMY_SPAN), ], ); diff --git a/tamer/src/xir/tree.rs b/tamer/src/xir/tree.rs index be11358d..852501be 100644 --- a/tamer/src/xir/tree.rs +++ b/tamer/src/xir/tree.rs @@ -194,7 +194,7 @@ //! For more information, //! see [`AttrParts`]. -use super::{QName, Text, Token, TokenResultStream, TokenStream}; +use super::{QName, Token, TokenResultStream, TokenStream}; use crate::{span::Span, sym::SymbolId}; use std::{fmt::Display, iter, mem::take}; @@ -224,7 +224,7 @@ pub enum Tree { /// /// A text node cannot contain other [`Tree`] elements; /// sibling text nodes must exist within an [`Element`]. - Text(Text, Span), + Text(SymbolId, Span), /// This variant exists purely because `#[non_exhaustive]` has no effect /// within the crate. @@ -246,9 +246,9 @@ impl Into> for Tree { } } -impl Into> for Tree { +impl Into> for Tree { #[inline] - fn into(self) -> Option { + fn into(self) -> Option { match self { Self::Text(text, _) => Some(text), _ => None, @@ -280,22 +280,17 @@ impl Tree { matches!(self, Self::Element(_)) } - /// Yield a reference to the inner value if it is a [`Text`], - /// otherwise [`None`]. + /// Yield a string representation of the element, + /// if applicable. + /// + /// This is incomplete. #[inline] - pub fn as_text<'a>(&'a self) -> Option<&'a Text> { + pub fn as_sym(&self) -> Option { match self { - Self::Text(text, _) => Some(text), + Self::Text(sym, ..) => Some(*sym), _ => None, } } - - /// Yield the inner value if it is a [`Text`], - /// otherwise [`None`]. - #[inline] - pub fn into_text(self) -> Option { - self.into() - } } /// Element node. @@ -729,7 +724,7 @@ impl Stack { /// Appends a text node as a child of an element. /// /// This is valid only for a [`Stack::BuddingElement`]. - fn text(self, value: Text, span: Span) -> Result { + fn text(self, value: SymbolId, span: Span) -> Result { Ok(match self { Self::BuddingElement(mut ele) => { ele.element.children.push(Tree::Text(value, span)); diff --git a/tamer/src/xir/tree/test.rs b/tamer/src/xir/tree/test.rs index c4f7ed66..34075895 100644 --- a/tamer/src/xir/tree/test.rs +++ b/tamer/src/xir/tree/test.rs @@ -31,8 +31,6 @@ lazy_static! { } mod tree { - use crate::xir::Text; - use super::*; #[test] @@ -47,20 +45,19 @@ mod tree { let tree = Tree::Element(ele.clone()); assert_eq!(Some(&ele), tree.as_element()); - assert_eq!(None, tree.as_text()); + assert_eq!(None, Into::>::into(tree)); } #[test] fn text_from_tree() { - let text = Text::Escaped("foo".intern()); + let text = "foo".intern(); let tree = Tree::Text(text, *S); assert!(!tree.is_element()); assert_eq!(None, tree.as_element()); assert_eq!(None, tree.clone().into_element()); - assert_eq!(Some(&text), tree.as_text()); - assert_eq!(Some(text), tree.into_text()); + assert_eq!(Some(text), tree.into()); } } @@ -332,7 +329,7 @@ fn element_with_child_with_attributes() { #[test] fn element_with_text() { let parent = "parent".unwrap_into(); - let text = Text::Escaped("inner text".into()); + let text = "inner text".into(); let toks = [ Token::Open(parent, *S), diff --git a/tamer/src/xir/writer.rs b/tamer/src/xir/writer.rs index 562d3d27..3d5c9b26 100644 --- a/tamer/src/xir/writer.rs +++ b/tamer/src/xir/writer.rs @@ -22,7 +22,6 @@ use super::{DefaultEscaper, Escaper}; use super::{Error as XirError, QName, Token, TokenStream}; use crate::sym::GlobalSymbolResolve; -use crate::xir::Text; use std::io::{Error as IoError, Write}; use std::result; @@ -259,35 +258,21 @@ impl XmlWriter for Token { // AttrEnd is ignored by the writer (and is optional). (Self::AttrEnd, x) => Ok(x), - // Unescaped not yet supported, but you could use CData. - ( - Self::Text(Text::Escaped(text), _), - W::NodeExpected | W::NodeOpen, - ) => { + // TODO: We have no way of knowing if text should be formatted + // as CData, + // which may also be beneficial to avoid escaping if we + // haven't yet encountered the unescaped representation. + (Self::Text(text, _), W::NodeExpected | W::NodeOpen) => { prev_state.close_tag_if_open(sink)?; - sink.write(text.lookup_str().as_bytes())?; + sink.write(escaper.escape(text).lookup_str().as_bytes())?; Ok(W::NodeExpected) } - // Escaped not yet supported, but you could use Text. - ( - Self::CData(Text::Unescaped(text), _), - W::NodeExpected | W::NodeOpen, - ) => { - prev_state.close_tag_if_open(sink)?; - sink.write(b"")?; - - Ok(W::NodeExpected) - } - - // Unescaped not yet supported, since we do not have a use case. - ( - Self::Comment(Text::Escaped(comment), _), - W::NodeExpected | W::NodeOpen, - ) => { + // XXX: While we currently only output comments that have been + // _read_ as comments, + // that will not always be the case and we must escape `--`! + (Self::Comment(comment, _), W::NodeExpected | W::NodeOpen) => { prev_state.close_tag_if_open(sink)?; sink.write(b""); assert_eq!(result.1, WriterState::NodeExpected); // When a node is still open. let result = Token::Comment(comment, *S) - .write_new(WriterState::NodeOpen, &Esc::default())?; + .write_new(WriterState::NodeOpen, &MockEscaper::default())?; assert_eq!(result.0, b">"); assert_eq!(result.1, WriterState::NodeExpected); @@ -596,7 +555,7 @@ mod test { Token::AttrValue("".intern(), *S).write( &mut vec![], WriterState::NodeExpected, - &Esc::default() + &MockEscaper::default() ), Err(Error::UnexpectedToken(_, WriterState::NodeExpected)), )); @@ -615,18 +574,18 @@ mod test { Token::AttrName(("an", "attr").try_into()?, *S), Token::AttrValue("value".intern(), *S), Token::AttrEnd, - Token::Text(Text::Escaped("text".intern()), *S), + Token::Text("text".intern(), *S), Token::Open(("c", "child").try_into()?, *S), Token::Whitespace(" ".try_into()?, *S), Token::Close(None, *S), Token::Close(Some(root), *S), ] .into_iter() - .write_new(Default::default(), &Esc::default())?; + .write_new(Default::default(), &MockEscaper::default())?; assert_eq!( result.0, - br#"text"# + br#"text:ESC"# ); assert_eq!(result.1, WriterState::NodeExpected);