tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081
main
Mike Gerwitz 2021-11-12 13:59:14 -05:00
parent b1c0783c75
commit 27ba03b59b
13 changed files with 554 additions and 629 deletions

View File

@ -138,8 +138,8 @@ mod writer {
Writer as QuickXmlWriter,
};
use std::borrow::Cow;
use tamer::span::Span;
use tamer::xir::{writer::XmlWriter, Text};
use tamer::{span::Span, xir::DefaultEscaper};
const FRAGMENT: &str = r#"<fragment>
This is pretend fragment text. We need a lot of it.
@ -216,7 +216,7 @@ This is pretend fragment text. We need a lot of it.</fragment>
Token::Close(None, span),
]
.into_iter()
.write(&mut buf, Default::default())
.write(&mut buf, Default::default(), &DefaultEscaper::default())
.unwrap();
});
});
@ -253,7 +253,11 @@ This is pretend fragment text. We need a lot of it.</fragment>
bench.iter(|| {
(0..50).for_each(|_| {
Token::Text(Text::Escaped(frag), span)
.write(&mut buf, Default::default())
.write(
&mut buf,
Default::default(),
&DefaultEscaper::default(),
)
.unwrap();
});
});

View File

@ -25,12 +25,14 @@ use super::xmle::{
xir::lower_iter,
XmleSections,
};
use crate::asg::{Asg, DefaultAsg, IdentObject};
use crate::global;
use crate::obj::xmlo::{AsgBuilder, AsgBuilderState, XmloReader};
use crate::sym::SymbolId;
use crate::sym::{GlobalSymbolIntern, GlobalSymbolResolve};
use crate::xir::writer::XmlWriter;
use crate::{
asg::{Asg, DefaultAsg, IdentObject},
xir::DefaultEscaper,
};
use crate::{
fs::{
Filesystem, FsCanonicalizer, PathFile, VisitOnceFile,
@ -38,6 +40,10 @@ use crate::{
},
ld::xmle::Sections,
};
use crate::{
obj::xmlo::{AsgBuilder, AsgBuilderState, XmloReader},
xir::Escaper,
};
use fxhash::FxBuildHasher;
use petgraph_graphml::GraphMl;
use std::error::Error;
@ -54,11 +60,24 @@ type LinkerAsgBuilderState =
pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
let mut fs = VisitOnceFilesystem::new();
let mut depgraph = LinkerAsg::with_capacity(65536, 65536);
let escaper = {
#[cfg(feature = "wip-xmlo-xir-reader")]
{
DefaultEscaper::default()
}
#[cfg(not(feature = "wip-xmlo-xir-reader"))]
{
// The original POC linker did nothing with escape sequences,
// since it simply shuffles data around and re-outputs as XML.
crate::xir::NullEscaper::default()
}
};
let state = load_xmlo(
package_path,
&mut fs,
&mut depgraph,
&escaper,
AsgBuilderState::new(),
)?;
@ -105,6 +124,7 @@ pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
name.expect("missing root package name"),
relroot.expect("missing root package relroot"),
output,
&escaper,
)?;
Ok(())
@ -113,11 +133,13 @@ pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
pub fn graphml(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
let mut fs = VisitOnceFilesystem::new();
let mut depgraph = LinkerAsg::with_capacity(65536, 65536);
let escaper = DefaultEscaper::default();
let _ = load_xmlo(
package_path,
&mut fs,
&mut depgraph,
&escaper,
AsgBuilderState::new(),
)?;
@ -161,10 +183,11 @@ pub fn graphml(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
Ok(())
}
fn load_xmlo<'a, P: AsRef<Path>>(
fn load_xmlo<'a, P: AsRef<Path>, S: Escaper>(
path_str: P,
fs: &mut VisitOnceFilesystem<FsCanonicalizer, FxBuildHasher>,
depgraph: &mut LinkerAsg,
escaper: &S,
state: LinkerAsgBuilderState,
) -> Result<LinkerAsgBuilderState, Box<dyn Error>> {
let cfile: PathFile<BufReader<fs::File>> = match fs.open(path_str)? {
@ -186,7 +209,7 @@ fn load_xmlo<'a, P: AsRef<Path>>(
use crate::iter::into_iter_while_ok;
use crate::xir::reader::XmlXirReader;
into_iter_while_ok(XmlXirReader::from(file), |toks| {
into_iter_while_ok(XmlXirReader::new(file, escaper), |toks| {
let xmlo: XmloReader<_> = toks.into();
depgraph.import_xmlo(xmlo, state)
})??
@ -204,22 +227,27 @@ fn load_xmlo<'a, P: AsRef<Path>>(
path_buf.push(str);
path_buf.set_extension("xmlo");
state = load_xmlo(path_buf, fs, depgraph, state)?;
state = load_xmlo(path_buf, fs, depgraph, escaper, state)?;
}
Ok(state)
}
fn output_xmle<'a, S: XmleSections<'a>>(
sorted: S,
fn output_xmle<'a, X: XmleSections<'a>, S: Escaper>(
sorted: X,
name: SymbolId,
relroot: SymbolId,
output: &str,
escaper: &S,
) -> Result<(), Box<dyn Error>> {
let file = fs::File::create(output)?;
let mut buf = BufWriter::new(file);
lower_iter(sorted, name, relroot).write(&mut buf, Default::default())?;
lower_iter(sorted, name, relroot).write(
&mut buf,
Default::default(),
escaper,
)?;
buf.flush()?;
Ok(())

View File

@ -35,7 +35,7 @@ use crate::{
sym::{st::*, SymbolId},
xir::{
iter::{elem_wrap, ElemWrapIter},
AttrValue, QName, Text, Token, XirString,
QName, Text, Token,
},
};
use arrayvec::ArrayVec;
@ -75,28 +75,21 @@ type HeaderIter = array::IntoIter<Token, HEADER_SIZE>;
/// and its immediate child.
#[inline]
fn header(pkg_name: SymbolId, relroot: SymbolId) -> HeaderIter {
// TODO: Introduce newtypes so that we do not have to make unsafe
// assumptions.
let pkg_name_val =
AttrValue::from(unsafe { XirString::assume_fixed(pkg_name) });
let relroot_val =
AttrValue::from(unsafe { XirString::assume_fixed(relroot) });
[
Token::AttrName(QN_XMLNS, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_RATER), LSPAN),
Token::AttrValue(raw::URI_LV_RATER, LSPAN),
Token::AttrName(QN_XMLNS_PREPROC, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_PREPROC), LSPAN),
Token::AttrValue(raw::URI_LV_PREPROC, LSPAN),
Token::AttrName(QN_XMLNS_L, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_LINKER), LSPAN),
Token::AttrValue(raw::URI_LV_LINKER, LSPAN),
Token::AttrName(QN_TITLE, LSPAN),
Token::AttrValue(pkg_name_val, LSPAN),
Token::AttrValue(pkg_name, LSPAN),
Token::AttrName(QN_PROGRAM, LSPAN),
Token::AttrValue(AttrValue::st_cid(L_TRUE), LSPAN),
Token::AttrValue(raw::L_TRUE, LSPAN),
Token::AttrName(QN_NAME, LSPAN),
Token::AttrValue(pkg_name_val, LSPAN),
Token::AttrValue(pkg_name, LSPAN),
Token::AttrName(QN_UUROOTPATH, LSPAN),
Token::AttrValue(relroot_val, LSPAN),
Token::AttrValue(relroot, LSPAN),
]
.into_iter()
}
@ -124,7 +117,7 @@ struct DepListIter<'a> {
/// Constant-size [`Token`] buffer used as a stack.
toks: ArrayVec<Token, DEP_TOK_SIZE>,
/// Relative path to project root.
relroot: AttrValue,
relroot: SymbolId,
}
impl<'a> DepListIter<'a> {
@ -133,11 +126,7 @@ impl<'a> DepListIter<'a> {
Self {
iter,
toks: ArrayVec::new(),
// TODO: we cannot trust that an arbitrary symbol is escaped; this
// needs better typing, along with other things.
relroot: AttrValue::from(unsafe {
XirString::assume_fixed(relroot)
}),
relroot,
}
}
@ -172,10 +161,7 @@ impl<'a> DepListIter<'a> {
if let Some(pkg_name) = src.pkg_name {
// TODO: Introduce newtypes so that we do not have to make unsafe
// assumptions.
let pkg_name_val =
AttrValue::from(unsafe { XirString::assume_fixed(pkg_name) });
self.toks.push(Token::AttrValue(pkg_name_val, LSPAN));
self.toks.push(Token::AttrValue(pkg_name, LSPAN));
self.toks.push(Token::AttrValueFragment(self.relroot, LSPAN));
self.toks.push(Token::AttrName(QN_SRC, LSPAN));
}
@ -199,9 +185,7 @@ impl<'a> DepListIter<'a> {
#[inline]
fn toks_push_attr(&mut self, name: QName, value: Option<SymbolId>) {
if let Some(val) = value {
let attr_val = AttrValue::from(val);
self.toks.push(Token::AttrValue(attr_val, LSPAN));
self.toks.push(Token::AttrValue(val, LSPAN));
self.toks.push(Token::AttrName(name, LSPAN));
}
}
@ -284,11 +268,7 @@ impl MapFromsIter {
self.iter.next().and_then(|from| {
self.toks.push(Token::Close(None, LSPAN));
// TODO
let from_val =
AttrValue::from(unsafe { XirString::assume_fixed(from) });
self.toks.push(Token::AttrValue(from_val, LSPAN));
self.toks.push(Token::AttrValue(from, LSPAN));
self.toks.push(Token::AttrName(QN_NAME, LSPAN));
Some(Token::Open(QN_L_FROM, LSPAN))

View File

@ -256,12 +256,12 @@ fn test_writes_deps() -> TestResult {
assert_eq!(
attrs.find(QN_NAME).and_then(|a| a.value_atom()),
Some(AttrValue::from(ident.name())),
Some(ident.name()),
);
assert_eq!(
attrs.find(QN_TYPE).and_then(|a| a.value_atom()),
Some(AttrValue::from(ident.kind().unwrap().as_sym()))
Some(ident.kind().unwrap().as_sym())
);
let generated = attrs.find(QN_GENERATED).and_then(|a| a.value_atom());
@ -270,17 +270,17 @@ fn test_writes_deps() -> TestResult {
generated: true, ..
}) = ident.src()
{
assert_eq!(generated, Some(AttrValue::from("true".intern())));
assert_eq!(generated, Some("true".intern()));
} else {
assert_eq!(generated, None);
}
if let Some(Source { parent, .. }) = ident.src() {
assert_attr!(attrs, QN_PARENT, parent.map(|x| AttrValue::from(x)),);
assert_attr!(attrs, QN_PARENT, *parent,);
}
if let Some(Source { yields, .. }) = ident.src() {
assert_attr!(attrs, QN_YIELDS, yields.map(|x| AttrValue::from(x)),);
assert_attr!(attrs, QN_YIELDS, *yields,);
}
if let Some(Source {
@ -311,10 +311,7 @@ fn test_writes_deps() -> TestResult {
Some(Attr::Extensible(parts)) => {
assert_eq!(
parts.value_fragments(),
&vec![
(AttrValue::from(relroot), LSPAN),
(AttrValue::from(*pkg_name), LSPAN),
]
&vec![(relroot, LSPAN), (*pkg_name, LSPAN),]
);
}
invalid => panic!("unexpected desc: {:?}", invalid),
@ -327,7 +324,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DIM,
Some(AttrValue::from(Into::<SymbolId>::into(*dim))),
Some(Into::<SymbolId>::into(*dim)),
"invalid {:?} @dim",
ident.kind()
);
@ -341,7 +338,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DIM,
Some(AttrValue::from(Into::<SymbolId>::into(*dim))),
Some((*dim).into()),
"invalid {:?} @dim",
ident.kind()
);
@ -349,7 +346,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DTYPE,
Some(AttrValue::from(Into::<SymbolId>::into(*dtype))),
Some((*dtype).into()),
"invalid {:?} @dtype",
ident.kind()
);
@ -359,7 +356,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DTYPE,
Some(AttrValue::from(Into::<SymbolId>::into(*dtype))),
Some((*dtype).into()),
"invalid {:?} @dim",
ident.kind()
);
@ -438,8 +435,8 @@ fn test_writes_map_froms() -> TestResult {
);
});
assert!(found.contains(&AttrValue::from("froma".intern())));
assert!(found.contains(&AttrValue::from("fromb".intern())));
assert!(found.contains(&"froma".intern()));
assert!(found.contains(&"fromb".intern()));
Ok(())
}

View File

@ -50,17 +50,6 @@
//!# }
//! ```
//!
//! However,
//! certain elements cannot fully parse on their own because require
//! important contextual information,
//! such as [`AttrValue`],
//! which requires knowing whether the provided value is escaped.
//! It is important that the caller is diligent in making the proper
//! determination in these cases,
//! otherwise it could result in situations ranging from invalid
//! compiler output to security vulnerabilities
//! (via XML injection).
//!
//! To parse an entire XML document,
//! see [`reader`].
@ -68,19 +57,17 @@ use crate::span::Span;
use crate::sym::{
st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern,
GlobalSymbolInternBytes, StaticSymbolId, SymbolId, TameIdentStaticSymbolId,
UriStaticSymbolId,
};
use memchr::memchr;
use std::convert::{TryFrom, TryInto};
use std::fmt::Display;
use std::hash::Hash;
use std::ops::Deref;
mod error;
pub use error::Error;
mod string;
pub use string::XirString;
mod escape;
pub use escape::{DefaultEscaper, Escaper, NullEscaper};
pub mod iter;
pub mod pred;
@ -494,47 +481,6 @@ pub enum Text {
Escaped(SymbolId),
}
/// Represents an attribute value and its escaped contents.
///
/// Being explicit about the state of escaping allows us to skip checks when
/// we know that the generated text could not possibly require escaping.
/// This does, however, put the onus on the caller to ensure that they got
/// the escaping status correct.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct AttrValue(XirString);
impl AttrValue {
/// Construct a constant escaped attribute from a static C-style symbol.
pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self {
Self(XirString::st_cid(sym))
}
/// Construct a constant escaped attribute from a static URI symbol.
///
/// URIs are expected _not_ to contain quotes.
pub const fn st_uri(sym: UriStaticSymbolId) -> Self {
Self(XirString::st_uri(sym))
}
}
impl<T: Into<XirString>> From<T> for AttrValue {
fn from(s: T) -> Self {
Self(s.into())
}
}
impl Into<SymbolId> for AttrValue {
fn into(self) -> SymbolId {
self.0.into()
}
}
impl Display for AttrValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Lightly-structured XML tokens with associated [`Span`]s.
///
/// This is a streamable IR for XML.
@ -575,7 +521,7 @@ pub enum Token {
AttrName(QName, Span),
/// Element attribute value.
AttrValue(AttrValue, Span),
AttrValue(SymbolId, Span),
/// A portion of an element attribute value.
///
@ -586,7 +532,7 @@ pub enum Token {
/// Since each fragment contains a span,
/// this also potentially gives higher resolution for the origin of
/// components of generated attribute values.
AttrValueFragment(AttrValue, Span),
AttrValueFragment(SymbolId, Span),
/// A delimiter indicating that attribute processing has ended and the
/// next token will be either a child node or [`Token::Close`].

View File

@ -0,0 +1,188 @@
// XIR string escaping and unescaping
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Escaping and unescaping for writers and readers respectively.
//!
//! An [`Escaper`] is required by XIR readers and writers.
//! An escaper may perform caching to avoid unnecessary work,
//! so it is advantageous to provide the _same_ instance to all readers
//! and writers.
//! [`Escaper`] methods use interior mutability to facilitate this,
//! since TAMER streams lowering operations where possible,
//! meaning that multiple readers and writers will require references
//! to the [`Escaper`].
//!
//! Safety
//! ======
//! The purpose of this type is to provide safety against XML injection by
//! encapsulating all responsibility within a single object.
//! The idea is simple:
//! a [`SymbolId`] _always_ represents an unescaped string.
//! This prevents, primarily,
//!
//! 1. XML injection (via lack of escaping); and
//! 2. Erroneous multiple escape/unescape.
//!
//! This module is the _only_ part of the system that has access to raw,
//! escaped values.
//! Outside of this module,
//! it is assumed that the rest of the system is working with _unescaped_
//! values---after all,
//! why would other parts of the system not dealing with XML directly
//! take it upon themselves to deal with XML directly?
//! If we permitted retrieving raw escaped [`SymbolId`]s,
//! then we run the risk of that value being used to construct a XIR
//! stream and be subsequently double-encoded upon writing.
use crate::sym::{
GlobalSymbolInternBytes, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
SymbolId,
};
use std::borrow::Cow;
use super::Error;
/// XIR escaper and unescaper.
///
/// Escapers are responsible for parsing XML escape sequences as necessary
/// on read,
/// and properly escaping characters on write.
/// This is the only part of the system defending XIR against XML
/// injection.
///
/// Escapers must use interior mutability for any internal state
/// (e.g. caching),
/// since multiple readers and writers will require references.
pub trait Escaper: Default {
/// Escape raw bytes such that they become suitable for writing into an
/// XML document as text.
///
/// This value must be escaped such that subsequence unescaping
/// (using [`unescape_bytes`](Escaper::unescape_bytes))
/// will result in the same value.
fn escape_bytes(value: &[u8]) -> Cow<[u8]>;
/// Unescape raw bytes such that any relevant escape sequences are
/// parsed into their text representation.
fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error>;
/// Escape the given symbol and produce a [`SymbolId`] representing
/// the escaped value suitable for writing.
fn escape(&self, sym: SymbolId) -> SymbolId {
match Self::escape_bytes(sym.lookup_str().as_bytes()) {
// We got back what we sent in,
// so this value is fixed.
Cow::Borrowed(_) => sym,
// The value changed,
// so we must allocate a new symbol.
// SAFETY: The unescaped symbol is valid UTF-8 unless it was
// unsafely allocated.
// Given that escaping does not introduce any invalid UTF-8
// sequences
// (as is trivially verified by reading its implementation),
// we can skip the UTF-8 check.
Cow::Owned(esc) => unsafe { esc[..].intern_utf8_unchecked() },
}
}
/// Unescape the provided raw value and return a [`SymbolId`]
/// representing the unescaped value.
fn unescape_intern<'a>(
&self,
escaped: &'a [u8],
) -> Result<SymbolId, Error> {
Ok(match Self::unescape_bytes(escaped)? {
// We got back what we sent in,
// so this value is fixed.
Cow::Borrowed(orig) => {
debug_assert!(orig == escaped);
orig.intern_utf8()?
}
// The value was rewritten,
// meaning that the original was escaped.
// We can't assume that it's valid UTF-8.
Cow::Owned(unesc) => unesc.intern_utf8()?,
})
}
}
/// Escape and unescape using [`quick_xml`].
#[derive(Debug, Clone, Copy, Default)]
pub struct QuickXmlEscaper {}
impl Escaper for QuickXmlEscaper {
#[inline]
fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
quick_xml::escape::escape(value)
}
#[inline]
fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error> {
// For some reason,
// quick-xml has made EscapeError explicitly private to the crate,
// and so it is opaque to us.
// They have, however,
// implemented `From<EscapeError> for Error`,
// which we will use here.
Ok(quick_xml::escape::unescape(value)
.map_err(quick_xml::Error::from)?)
}
}
/// An [`Escaper`] that passes values through untouched on write and
///   refuses to unescape.
///
/// _This should be removed after development of the XIR-based readers!_
#[cfg(not(feature = "wip-xmlo-xir-reader"))]
#[derive(Debug, Clone, Copy, Default)]
pub struct NullEscaper {}

#[cfg(not(feature = "wip-xmlo-xir-reader"))]
impl Escaper for NullEscaper {
    /// Hand the input back as a borrow,
    ///   performing no escaping.
    #[inline]
    fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
        value.into()
    }

    /// Always panics;
    ///   this escaper is intended only for writing.
    #[inline]
    fn unescape_bytes(_value: &[u8]) -> Result<Cow<[u8]>, Error> {
        panic!("NullEscaper should not be used for unescaping")
    }
}
/// Default [`Escaper`] implementation;
///   currently an alias for [`QuickXmlEscaper`].
pub type DefaultEscaper = QuickXmlEscaper;
#[cfg(test)]
mod test {
    use super::*;
    use crate::sym::GlobalSymbolIntern;

    // Simple sanity check to ensure that the default escaper actually does
    // some sort of escaping.
    // `escape` already yields a `SymbolId`,
    // so the result is compared directly without any conversion.
    #[test]
    fn default_escaper_escapes() {
        let sut = DefaultEscaper::default();

        assert_eq!("foo&lt;bar".intern(), sut.escape("foo<bar".intern()));
    }
}

View File

@ -21,7 +21,7 @@
//!
//! This uses [`quick_xml`] as the parser.
use super::{Error, Token, XirString};
use super::{DefaultEscaper, Error, Escaper, Token};
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes, xir::Text};
use quick_xml::{
self,
@ -46,7 +46,11 @@ pub type Result<T> = result::Result<T, Error>;
///
/// [`None`] is returned only on EOF,
/// not on error.
pub struct XmlXirReader<B: BufRead> {
pub struct XmlXirReader<'s, B, S = DefaultEscaper>
where
B: BufRead,
S: Escaper,
{
/// Inner parser.
reader: quick_xml::Reader<B>,
@ -61,10 +65,13 @@ pub struct XmlXirReader<B: BufRead> {
/// after which [`XmlXirReader::refill_buf`] requests another token
/// from `reader`.
tokbuf: VecDeque<Token>,
/// System for unescaping string data.
escaper: &'s S,
}
impl<B: BufRead> XmlXirReader<B> {
pub fn new(reader: B) -> Self {
impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
pub fn new(reader: B, escaper: &'s S) -> Self {
let mut reader = quick_xml::Reader::from_reader(reader);
// XIR must support mismatched tags so that we are able to represent
@ -79,6 +86,8 @@ impl<B: BufRead> XmlXirReader<B> {
// but [`Token`]s are small enough that it likely does not
// matter much.
tokbuf: VecDeque::with_capacity(32),
escaper,
}
}
@ -99,21 +108,25 @@ impl<B: BufRead> XmlXirReader<B> {
Ok(ev) => match ev {
QuickXmlEvent::Empty(ele) => Some(
Self::parse_element_open(&mut self.tokbuf, ele).and_then(
|open| {
// Tag is self-closing, but this does not yet
// handle whitespace before the `/`.
self.tokbuf
.push_front(Token::Close(None, DUMMY_SPAN));
Self::parse_element_open(
&self.escaper,
&mut self.tokbuf,
ele,
)
.and_then(|open| {
// Tag is self-closing, but this does not yet
// handle whitespace before the `/`.
self.tokbuf.push_front(Token::Close(None, DUMMY_SPAN));
Ok(open)
},
),
Ok(open)
}),
),
QuickXmlEvent::Start(ele) => {
Some(Self::parse_element_open(&mut self.tokbuf, ele))
}
QuickXmlEvent::Start(ele) => Some(Self::parse_element_open(
&self.escaper,
&mut self.tokbuf,
ele,
)),
QuickXmlEvent::End(ele) => {
Some(ele.name().try_into().map_err(Error::from).and_then(
@ -156,6 +169,7 @@ impl<B: BufRead> XmlXirReader<B> {
/// buffer,
/// since the intent is to provide that token immediately.
fn parse_element_open(
escaper: &'s S,
tokbuf: &mut VecDeque<Token>,
ele: BytesStart,
) -> Result<Token> {
@ -163,7 +177,7 @@ impl<B: BufRead> XmlXirReader<B> {
.try_into()
.map_err(Error::from)
.and_then(|qname| {
Self::parse_attrs(tokbuf, ele.attributes())?;
Self::parse_attrs(escaper, tokbuf, ele.attributes())?;
// The first token will be immediately returned
// via the Iterator.
@ -178,6 +192,7 @@ impl<B: BufRead> XmlXirReader<B> {
/// This does not yet handle whitespace between attributes,
/// or around `=`.
fn parse_attrs<'a>(
escaper: &'s S,
tokbuf: &mut VecDeque<Token>,
mut attrs: Attributes<'a>,
) -> Result<()> {
@ -199,8 +214,7 @@ impl<B: BufRead> XmlXirReader<B> {
// that's okay as long as we can read it again,
// but we probably should still throw an error if we
// encounter such a situation.
let value =
XirString::from_escaped_raw(attr.value.as_ref())?.into();
let value = escaper.unescape_intern(attr.value.as_ref())?.into();
tokbuf.push_front(Token::AttrName(name, DUMMY_SPAN));
tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
@ -215,7 +229,11 @@ impl<B: BufRead> XmlXirReader<B> {
}
}
impl<B: BufRead> Iterator for XmlXirReader<B> {
impl<'s, B, S> Iterator for XmlXirReader<'s, B, S>
where
B: BufRead,
S: Escaper,
{
type Item = Result<Token>;
/// Produce the next XIR [`Token`] from the input.
@ -230,11 +248,5 @@ impl<B: BufRead> Iterator for XmlXirReader<B> {
}
}
impl<B: BufRead> From<B> for XmlXirReader<B> {
fn from(reader: B) -> Self {
Self::new(reader)
}
}
#[cfg(test)]
mod test;

View File

@ -17,12 +17,14 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::borrow::Cow;
use super::*;
use crate::sym::GlobalSymbolIntern;
use crate::{
convert::ExpectInto,
span::DUMMY_SPAN,
xir::{AttrValue, Text, Token},
xir::{Error, Text, Token},
};
/// These tests use [`quick_xml`] directly,
@ -41,7 +43,24 @@ use crate::{
/// and by relying on certain parsing behavior to eliminate
/// redundant checks.
type Sut<B> = XmlXirReader<B>;
type Sut<'a, B, S> = XmlXirReader<'a, B, S>;
/// Test double for [`Escaper`] that makes unescaping observable in the
/// reader's output.
#[derive(Debug, Default)]
struct MockEscaper {}

// Marks each unescaped value by appending ":UNESC" to the provided bytes,
// so a test can tell that the reader routed the value through the escaper.
impl Escaper for MockEscaper {
    fn escape_bytes(_: &[u8]) -> Cow<[u8]> {
        unreachable!("Reader should not be escaping!")
    }

    fn unescape_bytes(value: &[u8]) -> result::Result<Cow<[u8]>, Error> {
        const SUFFIX: &[u8] = b":UNESC";

        let mut marked = Vec::with_capacity(value.len() + SUFFIX.len());
        marked.extend_from_slice(value);
        marked.extend_from_slice(SUFFIX);

        Ok(Cow::Owned(marked))
    }
}
/// A byte that will be invalid provided that there is either no following
/// UTF-8 byte,
@ -55,9 +74,20 @@ const INVALID_UTF8_BYTE: u8 = 0b11000000u8;
const INVALID_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[INVALID_UTF8_BYTE]) };
// Bind `$sut` in the calling scope to a new `Sut` reading from `$data`,
// using a freshly created `MockEscaper`.
//
// This expands to `let` statements rather than an expression:
// `Sut::new` is handed a reference to the escaper,
// so the escaper is created as a local alongside the reader.
macro_rules! new_sut {
// Input with an `as_bytes()` method (e.g. a string):
// convert it to bytes and defer to the `b` arm.
($sut:ident = $data:expr) => {
new_sut!(b $sut = $data.as_bytes())
};
// Byte input: `$data` is passed to `Sut::new` as-is.
(b $sut:ident = $data:expr) => {
let escaper = MockEscaper::default();
let $sut = Sut::new($data, &escaper);
};
}
#[test]
fn empty_node_without_prefix_or_attributes() {
let sut = Sut::new("<empty-node />".as_bytes());
new_sut!(sut = "<empty-node />");
let result = sut.collect::<Result<Vec<_>>>();
@ -74,7 +104,7 @@ fn empty_node_without_prefix_or_attributes() {
// Resolving namespaces is not the concern of XIR.
#[test]
fn does_not_resolve_xmlns() {
let sut = Sut::new(r#"<no-ns xmlns="noresolve" />"#.as_bytes());
new_sut!(sut = r#"<no-ns xmlns="noresolve" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -84,7 +114,7 @@ fn does_not_resolve_xmlns() {
Token::Open("no-ns".unwrap_into(), DUMMY_SPAN),
// Since we didn't parse @xmlns, it's still an attribute.
Token::AttrName("xmlns".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("noresolve".intern()), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -94,7 +124,7 @@ fn does_not_resolve_xmlns() {
// Resolving namespaces is not the concern of XIR.
#[test]
fn empty_node_with_prefix_without_attributes_unresolved() {
let sut = Sut::new(r#"<x:empty-node xmlns:x="noresolve" />"#.as_bytes());
new_sut!(sut = r#"<x:empty-node xmlns:x="noresolve" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -104,7 +134,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
vec![
Token::Open(("x", "empty-node").unwrap_into(), DUMMY_SPAN),
Token::AttrName(("xmlns", "x").unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("noresolve".intern()), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -115,7 +145,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
#[test]
fn prefix_with_empty_local_name_invalid_qname() {
// No local name (trailing colon).
let sut = Sut::new(r#"<x: xmlns:x="testns" />"#.as_bytes());
new_sut!(sut = r#"<x: xmlns:x="testns" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -130,7 +160,7 @@ fn prefix_with_empty_local_name_invalid_qname() {
// The order of attributes must be retained.
#[test]
fn multiple_attrs_ordered() {
let sut = Sut::new(r#"<ele foo="a" bar="b" b:baz="c" />"#.as_bytes());
new_sut!(sut = r#"<ele foo="a" bar="b" b:baz="c" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -139,11 +169,11 @@ fn multiple_attrs_ordered() {
vec![
Token::Open("ele".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("a".intern()), DUMMY_SPAN),
Token::AttrValue("a:UNESC".intern(), DUMMY_SPAN),
Token::AttrName("bar".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("b".intern()), DUMMY_SPAN),
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("c".intern()), DUMMY_SPAN),
Token::AttrValue("c:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -154,7 +184,7 @@ fn multiple_attrs_ordered() {
// need to allow it to support e.g. recovery, code formatting, and LSPs.
#[test]
fn permits_duplicate_attrs() {
let sut = Sut::new(r#"<dup attr="a" attr="b" />"#.as_bytes());
new_sut!(sut = r#"<dup attr="a" attr="b" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -163,9 +193,9 @@ fn permits_duplicate_attrs() {
vec![
Token::Open("dup".unwrap_into(), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("a".intern()), DUMMY_SPAN),
Token::AttrValue("a:UNESC".intern(), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("b".intern()), DUMMY_SPAN),
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -174,7 +204,7 @@ fn permits_duplicate_attrs() {
#[test]
fn child_node_self_closing() {
let sut = Sut::new(r#"<root><child /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -193,7 +223,7 @@ fn child_node_self_closing() {
#[test]
fn sibling_nodes() {
let sut = Sut::new(r#"<root><child /><child /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child /><child /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -215,7 +245,7 @@ fn sibling_nodes() {
#[test]
fn child_node_with_attrs() {
let sut = Sut::new(r#"<root><child foo="bar" /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child foo="bar" /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -226,7 +256,7 @@ fn child_node_with_attrs() {
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("bar".intern()), DUMMY_SPAN),
Token::AttrValue("bar:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
@ -236,7 +266,7 @@ fn child_node_with_attrs() {
#[test]
fn child_text() {
let sut = Sut::new(r#"<text>foo bar</text>"#.as_bytes());
new_sut!(sut = r#"<text>foo bar</text>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -253,7 +283,7 @@ fn child_text() {
#[test]
fn mixed_child_content() {
let sut = Sut::new(r#"<text>foo<em>bar</em></text>"#.as_bytes());
new_sut!(sut = r#"<text>foo<em>bar</em></text>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -277,13 +307,12 @@ fn mixed_child_content() {
// opening and closing tags of the root node.
#[test]
fn mixed_child_content_with_newlines() {
let sut = Sut::new(
r#"
new_sut!(
sut = r#"
<root>
<child />
</root>
"#
.as_bytes(),
);
let result = sut.collect::<Result<Vec<_>>>();
@ -307,7 +336,7 @@ fn mixed_child_content_with_newlines() {
#[test]
fn child_cdata() {
let sut = Sut::new(r#"<cd><![CDATA[<foo />]]></cd>"#.as_bytes());
new_sut!(sut = r#"<cd><![CDATA[<foo />]]></cd>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -325,7 +354,7 @@ fn child_cdata() {
#[test]
fn mixed_child_text_and_cdata() {
let sut = Sut::new(r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#.as_bytes());
new_sut!(sut = r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -347,7 +376,7 @@ fn mixed_child_text_and_cdata() {
#[test]
fn comment() {
let sut = Sut::new(r#"<!--root--><root><!--<child>--></root>"#.as_bytes());
new_sut!(sut = r#"<!--root--><root><!--<child>--></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -365,12 +394,11 @@ fn comment() {
#[test]
fn comment_multiline() {
let sut = Sut::new(
r#"<mult><!--comment
new_sut!(
sut = r#"<mult><!--comment
on multiple
lines-->
</mult>"#
.as_bytes(),
);
let result = sut.collect::<Result<Vec<_>>>();
@ -393,7 +421,7 @@ lines-->
// XIRT handles mismatch errors; XIR must explicitly support them.
#[test]
fn permits_mismatched_tags() {
let sut = Sut::new(r#"<root><child /></mismatch>"#.as_bytes());
new_sut!(sut = r#"<root><child /></mismatch>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -414,7 +442,7 @@ fn permits_mismatched_tags() {
#[test]
fn node_name_invalid_utf8() {
let bytes: &[u8] = &[b'<', INVALID_UTF8_BYTE, b'/', b'>'];
let sut = Sut::new(bytes);
new_sut!(b sut = bytes);
let result = sut.collect::<Result<Vec<_>>>();
@ -434,7 +462,7 @@ fn attr_name_invalid_utf8() {
s.push_str(INVALID_STR);
s.push_str(r#"="value"/>"#);
let sut = Sut::new(s.as_bytes());
new_sut!(sut = s);
let result = sut.collect::<Result<Vec<_>>>();
@ -454,14 +482,28 @@ fn attr_value_invalid_utf8() {
s.push_str(INVALID_STR);
s.push_str(r#""/>"#);
let sut = Sut::new(s.as_bytes());
new_sut!(sut = s);
let result = sut.collect::<Result<Vec<_>>>();
match result {
Ok(_) => panic!("expected failure"),
Err(Error::InvalidUtf8(_, bytes)) => {
assert_eq!(bytes, &[b'b', b'a', b'd', INVALID_UTF8_BYTE]);
assert_eq!(
bytes,
&[
b'b',
b'a',
b'd',
INVALID_UTF8_BYTE,
b':',
b'U',
b'N',
b'E',
b'S',
b'C'
]
);
}
_ => panic!("unexpected failure"),
}

View File

@ -1,321 +0,0 @@
// XIR string and escape context
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! [`XirString`] and escape context.
//!
//! Safety
//! ======
//! The purpose of this type is to provide safety against XML injection by
//! encapsulating all responsibility within a single object.
//! The idea is simple:
//! if you have a safely constructed [`XirString`],
//! it can be safely used in any context without worrying about these
//! critical problems:
//!
//! 1. XML injection (via lack of escaping); and
//! 2. Erroneous multiple escape/unescape.
//!
//! Both of these problems are solved by ensuring that the proper context
//! for a given [`SymbolId`] is always maintained---a
//! symbol is either valid to be written or not.
//! Similarly,
//! we must know whether a symbol is escaped or not to know whether it
//! ought to be unescaped while reading.
//!
//! This context also ensures that we will not erroneously unescape or
//! re-escape values at multiple points in a program,
//! leading to incorrect data at best and vulnerabilities at worst.
//!
//! To ensure this safety,
//! it is important that types understand how to convert between
//! one-another in well-defined ways.
//! It should not be possible to "just assume" that a value has already been
//! escaped.
//! Given that,
//! the constructors for this type are private to this module;
//! readers know the escape status and can produce the proper type and
//! internal types know how to translate between one-another,
//! but anything else making those assumptions is considered unsafe.
//!
//! Outside of this module,
//! it is assumed that the rest of the system is working with _unescaped_
//! values---after all,
//! why would other parts of the system not dealing with XML directly
//! take it upon themselves to deal with XML directly?
//! Given that,
//! other modules can only read unescaped values and construct
//! [`XirString`] using unescaped values.
//! If we permitted retrieving raw escaped [`SymbolId`]s,
//! then we could construct from it another [`XirString`] that is
//! considered to be unescaped,
//! which would result in a double-escape if it were read using
//! [`XirString::escaped`].
use crate::sym::{
CIdentStaticSymbolId, GlobalSymbolInternBytes, GlobalSymbolInternUnchecked,
GlobalSymbolResolve, SymbolId, UriStaticSymbolId,
};
use std::{
borrow::Cow,
fmt::Display,
hash::{Hash, Hasher},
marker::PhantomData,
};
use super::Error;
/// An XML string that requires escaping before writing.
///
/// This type must be used in contexts where writing to an XML document is
/// not safe without proper escaping,
/// and where reading may require unescaping.
///
/// The escaper `S` is carried only at the type level
/// (via [`PhantomData`]);
/// it is used to derive the escaped form when that form is not already
/// known.
#[derive(Debug, Clone, Copy)]
pub struct XirString<S: XirStringEscaper = DefaultXirStringEscaper> {
/// Unescaped form of the string,
/// which is the form exposed to callers.
unescaped: SymbolId,
/// Escaped form of the string,
/// if already known;
/// when [`None`] it is computed on demand using `S`.
escaped: Option<SymbolId>,
/// Associates this string with its escaper `S` without storing a value.
_escaper: PhantomData<S>,
}
// Since we implement Copy,
// ensure this size meets our expectations both as a sanity check and to
// ensure that attention is brought to this if it ever changes.
const_assert!(std::mem::size_of::<XirString>() <= std::mem::size_of::<usize>());
// A note about this type:
// `unescaped` is always present and only `escaped` is optional,
// so it is never possible for both forms to be missing.
// To ensure that the two forms never disagree with one another,
// (a) the fields must remain private;
// (b) all constructors must initialize both fields together; and
// (c) mutation must reconstruct using those constructors.
// This makes it possible to prove that the invariant always holds.
impl<S: XirStringEscaper> XirString<S> {
    /// Construct from raw bytes that are already in their escaped form.
    ///
    /// The bytes are interned as the escaped symbol and are then unescaped
    ///   using `S` to derive the unescaped symbol.
    /// If unescaping leaves the input untouched,
    ///   the one symbol serves as both forms.
    ///
    /// # Errors
    /// Propagates any failure from interning
    ///   (of either the escaped or the unescaped bytes)
    ///   and from `S::unescape_bytes`.
    pub(super) fn from_escaped_raw(escaped: &[u8]) -> Result<Self, Error> {
        let esc_sym = escaped.intern_utf8()?;

        let unesc_sym = match S::unescape_bytes(escaped)? {
            // The input came back as-is,
            //   so the escaped and unescaped forms coincide.
            Cow::Borrowed(same) => {
                debug_assert!(same == escaped);
                esc_sym
            }

            // The escaper rewrote the value,
            //   so the original contained escape sequences.
            // The rewritten bytes are not assumed to be valid UTF-8.
            Cow::Owned(rewritten) => rewritten.intern_utf8()?,
        };

        Ok(Self {
            unescaped: unesc_sym,
            escaped: Some(esc_sym),
            _escaper: PhantomData,
        })
    }

    /// Construct from an unescaped symbol.
    ///
    /// The escaped form is left unknown and will be derived using `S`
    ///   if it is ever requested.
    pub const fn new_unescaped(sym: SymbolId) -> Self {
        Self {
            unescaped: sym,
            escaped: None,
            _escaper: PhantomData,
        }
    }

    /// Construct from a symbol that is used, unchanged,
    ///   as both the escaped and the unescaped form.
    const fn new_fixed(sym: SymbolId) -> Self {
        Self {
            unescaped: sym,
            escaped: Some(sym),
            _escaper: PhantomData,
        }
    }

    /// Construct from a symbol that the caller asserts is identical in
    ///   its escaped and unescaped forms.
    ///
    /// # Safety
    /// No escaping will be applied to `sym` when it is written.
    /// The caller must guarantee that `sym` contains nothing that would
    ///   need to be escaped,
    ///     otherwise the value is open to the XML injection that this
    ///     type exists to prevent.
    pub const unsafe fn assume_fixed(sym: SymbolId) -> Self {
        Self::new_fixed(sym)
    }

    /// Construct a constant escaped attribute from a static C-style symbol.
    pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self {
        Self::new_fixed(sym.as_sym())
    }

    /// Construct a constant escaped attribute from a static URI symbol.
    ///
    /// URIs are expected _not_ to contain quotes.
    pub const fn st_uri(sym: UriStaticSymbolId) -> Self {
        Self::new_fixed(sym.as_sym())
    }

    /// Yield the escaped symbol,
    ///   escaping with `S` if the escaped form is not yet known.
    // TODO: This unnecessarily allocates a symbol that'll just be written
    // and not needed thereafter.
    #[inline]
    pub(super) fn into_escaped(self) -> SymbolId {
        match self.escaped {
            Some(esc) => esc,
            None => S::escape(self.unescaped),
        }
    }

    /// The unescaped symbol.
    #[inline]
    pub fn unescaped(&self) -> SymbolId {
        self.unescaped
    }
}
/// Strings are identified by their _unescaped_ value alone;
///   whether the escaped form is known has no bearing on equality.
impl PartialEq for XirString {
    fn eq(&self, rhs: &Self) -> bool {
        PartialEq::eq(&self.unescaped, &rhs.unescaped)
    }
}

impl Eq for XirString {}

/// Hashes only the unescaped value,
///   keeping the hash consistent with [`PartialEq`].
impl Hash for XirString {
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        Hash::hash(&self.unescaped, hasher);
    }
}
impl From<SymbolId> for XirString {
fn from(sym: SymbolId) -> Self {
Self::new_unescaped(sym)
}
}
/// Converting into a symbol yields the _unescaped_ value.
///
/// This is implemented as [`From`] rather than as a hand-written [`Into`]:
///   the standard blanket implementation still provides
///   `Into<SymbolId> for XirString`,
///     so existing `.into()` call sites are unaffected,
///     and `SymbolId::from` becomes available as well.
impl From<XirString> for SymbolId {
    fn from(xs: XirString) -> Self {
        xs.unescaped
    }
}
/// Displays the _unescaped_ value.
impl Display for XirString {
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Display::fmt(&self.unescaped, out)
    }
}
/// Strategy used by [`XirString`] to escape and unescape values.
///
/// All methods are associated functions;
///   no escaper value is needed to call them.
pub trait XirStringEscaper {
    /// Escape `value` for writing.
    ///
    /// A [`Cow::Borrowed`] result is taken to mean that the value was left
    ///   unchanged.
    /// The default [`escape`](XirStringEscaper::escape) interns owned
    ///   output _without_ UTF-8 validation,
    ///     so implementations must not introduce invalid UTF-8 sequences.
    fn escape_bytes(value: &[u8]) -> Cow<[u8]>;

    /// Unescape `value` after reading,
    ///   failing if it cannot be unescaped.
    fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error>;

    /// Escape the string identified by `sym`,
    ///   yielding the symbol of its escaped form.
    ///
    /// If escaping changes nothing then `sym` itself is returned and no
    ///   new symbol is allocated.
    fn escape(sym: SymbolId) -> SymbolId {
        if let Cow::Owned(esc) =
            Self::escape_bytes(sym.lookup_str().as_bytes())
        {
            // The value changed,
            // so we must allocate a new symbol.
            // SAFETY: The unescaped symbol is valid UTF-8 unless it was
            // unsafely allocated.
            // Given that escaping does not introduce any invalid UTF-8
            // sequences
            // (as is trivially verified by reading its implementation),
            // we can skip the UTF-8 check.
            unsafe { esc[..].intern_utf8_unchecked() }
        } else {
            // We got back what we sent in,
            // so this value is fixed.
            sym
        }
    }
}
#[derive(Debug, Clone, Copy)]
pub struct QuickXmlXirStringEscaper {}
impl XirStringEscaper for QuickXmlXirStringEscaper {
#[inline]
fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
quick_xml::escape::escape(value)
}
#[inline]
fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error> {
// For some reason,
// quick-xml has made EscapeError explicitly private to the crate,
// and so it is opaque to us.
// They have, however,
// implemented `From<EscapeError> for Error`,
// which we will use here.
Ok(quick_xml::escape::unescape(value)
.map_err(quick_xml::Error::from)?)
}
}
pub type DefaultXirStringEscaper = QuickXmlXirStringEscaper;
// Unit tests for `XirString` and its escapers.
#[cfg(test)]
mod test {
use super::*;
use crate::sym::GlobalSymbolIntern;
// Subject under test,
// left generic so that individual tests can choose the escaper.
type Sut<S> = XirString<S>;
// A string constructed from an unescaped symbol yields that same symbol
// back.
#[test]
fn create_from_and_retrieve_unescaped() {
let sym = "foo".intern();
let sut = Sut::new_unescaped(sym);
// Converting to a symbol yields the _unescaped_ value.
assert_eq!(sym, sut.into());
// An explicit method is also provided when the clarity is deemed
// necessary.
assert_eq!(sym, sut.unescaped());
}
// The unescaped values are used to identify the SUT.
#[test]
fn eq_on_unescape() {
let sym = "equal".intern();
assert_eq!(Sut::new_unescaped(sym), Sut::new_unescaped(sym));
}
// Escaping is delegated to the escaper chosen via the type parameter.
#[test]
fn escapes_using_escaper() {
const GIVEN: &str = "str to escape";
const EXPECTED: &str = "ESCAPED";
// Ignores the content of its input
// (beyond asserting on it)
// and always produces `EXPECTED`.
struct MockEscaper {}
impl XirStringEscaper for MockEscaper {
fn escape_bytes<'a>(value: &'a [u8]) -> Cow<'a, [u8]> {
assert_eq!(GIVEN.as_bytes(), value);
Cow::Owned(EXPECTED.as_bytes().to_owned())
}
fn unescape_bytes(_: &[u8]) -> Result<Cow<[u8]>, Error> {
unreachable!("not used in this test")
}
}
let sut = Sut::<MockEscaper>::new_unescaped(GIVEN.intern());
// Note that this uses the MockEscaper defined above,
// _not_ quick_xml.
assert_eq!(EXPECTED.intern(), sut.into_escaped());
}
// Simple sanity check to ensure that the default escaper actually does
// some sort of escaping.
#[test]
fn default_escaper_escapes() {
assert_eq!(
"foo&lt;bar".intern(),
Sut::<DefaultXirStringEscaper>::new_unescaped("foo<bar".intern())
.into_escaped()
);
}
}

View File

@ -194,8 +194,8 @@
//! For more information,
//! see [`AttrParts`].
use super::{AttrValue, QName, Text, Token, TokenResultStream, TokenStream};
use crate::span::Span;
use super::{QName, Text, Token, TokenResultStream, TokenStream};
use crate::{span::Span, sym::SymbolId};
use std::{fmt::Display, iter, mem::take};
mod attr;
@ -651,7 +651,7 @@ impl Stack {
/// which is responsible for managing future fragments.
///
/// This will cause heap allocation.
fn push_attr_value(self, value: AttrValue, span: Span) -> Result<Self> {
fn push_attr_value(self, value: SymbolId, span: Span) -> Result<Self> {
Ok(match self {
Self::AttrName(head, name, open_span) => {
// This initial capacity can be adjusted after we observe
@ -678,7 +678,7 @@ impl Stack {
/// If the attribute is composed of fragments ([`Stack::AttrFragments`]),
/// this serves as the final fragment and will yield an
/// [`Attr::Extensible`] with no further processing.
fn close_attr(self, value: AttrValue, span: Span) -> Result<Self> {
fn close_attr(self, value: SymbolId, span: Span) -> Result<Self> {
Ok(match self {
Self::AttrName(Some((ele_stack, attr_list)), name, open_span) => {
Self::BuddingAttrList(

View File

@ -23,8 +23,8 @@
//!
//! See [parent module](super) for additional documentation.
use super::{AttrValue, QName};
use crate::span::Span;
use super::QName;
use crate::{span::Span, sym::SymbolId};
/// An attribute.
///
@ -49,7 +49,7 @@ impl Attr {
/// but it can be cheaply converted into [`Attr::Extensible`] via
/// [`Attr::parts`] or [`From`].
#[inline]
pub fn new(name: QName, value: AttrValue, span: (Span, Span)) -> Self {
pub fn new(name: QName, value: SymbolId, span: (Span, Span)) -> Self {
Self::Simple(SimpleAttr::new(name, value, span))
}
@ -78,7 +78,7 @@ impl Attr {
pub fn from_fragments(
name: QName,
name_span: Span,
frags: Vec<(AttrValue, Span)>,
frags: Vec<(SymbolId, Span)>,
) -> Self {
Self::Extensible(AttrParts {
name,
@ -123,10 +123,10 @@ impl Attr {
/// return [`None`] and let the caller decide how to proceed with
/// deriving an atom.
///
/// Since [`AttrValue`] implements [`Copy`],
/// Since [`SymbolId`] implements [`Copy`],
/// this returns an owned value.
#[inline]
pub fn value_atom(&self) -> Option<AttrValue> {
pub fn value_atom(&self) -> Option<SymbolId> {
match self {
Self::Simple(attr) => Some(attr.value),
Self::Extensible(attr) if attr.value_frags.len() == 1 => {
@ -140,11 +140,11 @@ impl Attr {
/// Element attribute with an atomic value.
///
/// This should be used in place of [`AttrParts`] whenever the attribute is
/// a simple [`QName`]/[`AttrValue`] pair.
/// a simple [`QName`]/[`SymbolId`] pair.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct SimpleAttr {
name: QName,
value: AttrValue,
value: SymbolId,
/// Spans for the attribute name and value respectively.
span: (Span, Span),
}
@ -153,7 +153,7 @@ impl SimpleAttr {
/// Construct a new simple attribute with a name, value, and respective
/// [`Span`]s.
#[inline]
pub fn new(name: QName, value: AttrValue, span: (Span, Span)) -> Self {
pub fn new(name: QName, value: SymbolId, span: (Span, Span)) -> Self {
Self { name, value, span }
}
}
@ -174,7 +174,7 @@ pub struct AttrParts {
///
/// When writing,
/// fragments will be concatenated in order without any delimiters.
value_frags: Vec<(AttrValue, Span)>,
value_frags: Vec<(SymbolId, Span)>,
}
impl AttrParts {
@ -202,7 +202,7 @@ impl AttrParts {
/// and are associated with
/// [`Token::AttrValueFragment`](super::Token::AttrValueFragment).
#[inline]
pub fn push_value(&mut self, value: AttrValue, span: Span) {
pub fn push_value(&mut self, value: SymbolId, span: Span) {
self.value_frags.push((value, span));
}
@ -213,7 +213,7 @@ impl AttrParts {
/// [`AttrParts`],
/// see [`into_fragments`](AttrParts::into_fragments).
#[inline]
pub fn value_fragments(&self) -> &Vec<(AttrValue, Span)> {
pub fn value_fragments(&self) -> &Vec<(SymbolId, Span)> {
&self.value_frags
}
@ -223,7 +223,7 @@ impl AttrParts {
/// This allows the buffer to be re-used for future [`AttrParts`],
/// avoiding additional heap allocations.
#[inline]
pub fn into_fragments(self) -> Vec<(AttrValue, Span)> {
pub fn into_fragments(self) -> Vec<(SymbolId, Span)> {
self.value_frags
}
}
@ -320,7 +320,7 @@ mod test {
#[test]
fn attr_into_attr_parts() {
let name = "attr".unwrap_into();
let value = AttrValue::from("value".intern());
let value = "value".intern();
let attr = SimpleAttr {
name,
@ -347,8 +347,8 @@ mod test {
#[test]
fn push_attr_part() {
let name = "pushattr".unwrap_into();
let value1 = AttrValue::from("first".intern());
let value2 = AttrValue::from("second".intern());
let value1 = "first".intern();
let value2 = "second".intern();
let mut attr = Attr::new_extensible_with_capacity(name, *S, 2).parts();
@ -361,8 +361,8 @@ mod test {
#[test]
fn attr_from_parts() {
let name = "pushattr".unwrap_into();
let value1 = AttrValue::from("first".intern());
let value2 = AttrValue::from("second".intern());
let value1 = "first".intern();
let value2 = "second".intern();
let attr =
Attr::from_fragments(name, *S, vec![(value1, *S), (value2, *S2)])
@ -374,9 +374,9 @@ mod test {
#[test]
fn into_fragments_to_reuse_buffer_for_parts() {
let name = "partbuffer".unwrap_into();
let value1 = AttrValue::from("first".intern());
let value2 = AttrValue::from("second".intern());
let value3 = AttrValue::from("third".intern());
let value1 = "first".intern();
let value2 = "second".intern();
let value3 = "third".intern();
let frags = vec![(value1, *S2), (value2, *S)];

View File

@ -72,10 +72,8 @@ mod attrs {
let a = "a".unwrap_into();
let b = "b".unwrap_into();
let attra =
Attr::new(a, AttrValue::from("a value".intern()), (*S, *S2));
let attrb =
Attr::new(b, AttrValue::from("b value".intern()), (*S, *S2));
let attra = Attr::new(a, "a value".intern(), (*S, *S2));
let attrb = Attr::new(b, "b value".intern(), (*S, *S2));
let attrs = AttrList::from([attra.clone(), attrb.clone()]);
@ -162,10 +160,10 @@ fn empty_element_with_attrs_from_toks() {
let name = ("ns", "elem").unwrap_into();
let attr1 = "a".unwrap_into();
let attr2 = "b".unwrap_into();
let val1 = AttrValue::from("val1".intern());
let val2a = AttrValue::from("val2a".intern());
let val2b = AttrValue::from("val2b".intern());
let val2c = AttrValue::from("val2b".intern());
let val1 = "val1".intern();
let val2a = "val2a".intern();
let val2b = "val2b".intern();
let val2c = "val2b".intern();
let toks = [
Token::Open(name, *S),
@ -217,7 +215,7 @@ fn child_element_after_attrs() {
let name = ("ns", "elem").unwrap_into();
let child = "child".unwrap_into();
let attr = "a".unwrap_into();
let val = AttrValue::from("val".intern());
let val = "val".intern();
let toks = [
Token::Open(name, *S),
@ -301,7 +299,7 @@ fn element_with_child_with_attributes() {
let parent = "parent".unwrap_into();
let child = "child".unwrap_into();
let attr = "attr".unwrap_into();
let value = AttrValue::from("attr value".intern());
let value = "attr value".intern();
let toks = [
Token::Open(parent, *S),
@ -360,7 +358,7 @@ fn element_with_text() {
fn parser_from_filters_incomplete() {
let name = ("ns", "elem").unwrap_into();
let attr = "a".unwrap_into();
let val = AttrValue::from("val1".intern());
let val = "val1".intern();
let toks = [
Token::Open(name, *S),
@ -406,7 +404,7 @@ fn parse_attrs_fails_if_first_token_is_non_attr() {
fn parse_attrs_fails_if_end_before_attr_end() {
let mut toks = [
Token::AttrName("foo".unwrap_into(), *S),
Token::AttrValue(AttrValue::from("bar".intern()), *S),
Token::AttrValue("bar".intern(), *S),
// No Token::AttrEnd
]
.into_iter();
@ -423,7 +421,7 @@ fn parse_attrs_fails_if_missing_attr_end() {
// of Token::AttrEnd.
let mut toks = [
Token::AttrName("foo".unwrap_into(), *S),
Token::AttrValue(AttrValue::from("bar".intern()), *S2),
Token::AttrValue("bar".intern(), *S2),
// No Token::AttrEnd
Token::Close(None, *S3),
]
@ -439,8 +437,8 @@ fn parse_attrs_fails_if_missing_attr_end() {
fn parse_attrs_isolated() {
let attr1 = "one".unwrap_into();
let attr2 = "two".unwrap_into();
let val1 = AttrValue::from("val1".intern());
let val2 = AttrValue::from("val2".intern());
let val1 = "val1".intern();
let val2 = "val2".intern();
let mut toks = [
Token::AttrName(attr1, *S),
@ -476,8 +474,8 @@ fn attr_parser_with_non_attr_token() {
fn parser_attr_multiple() {
let attr1 = "one".unwrap_into();
let attr2 = "two".unwrap_into();
let val1 = AttrValue::from("val1".intern());
let val2 = AttrValue::from("val2".intern());
let val1 = "val1".intern();
let val2 = "val2".intern();
let mut toks = [
Token::AttrName(attr1, *S),

View File

@ -19,9 +19,10 @@
//! Lower XIR stream into an XML byte stream via [`Write`].
use super::{DefaultEscaper, Escaper};
use super::{Error as XirError, QName, Token, TokenStream};
use crate::sym::GlobalSymbolResolve;
use crate::xir::{AttrValue, Text};
use crate::xir::Text;
use std::io::{Error as IoError, Write};
use std::result;
@ -118,7 +119,11 @@ impl WriterState {
/// It uses a finite state machine (FSM),
/// where states are represented by [`WriterState`],
/// to avoid lookahead requirements.
pub trait XmlWriter: Sized {
pub trait XmlWriter<S = DefaultEscaper>
where
S: Escaper,
Self: Sized,
{
/// Write XML representation into the provided buffer.
///
/// The writer acts as a state machine to determine whether previous
@ -133,7 +138,12 @@ pub trait XmlWriter: Sized {
/// If you have a series of writes to perform,
/// consider using an [`Iterator`] implementing [`XmlWriter`].
#[must_use = "Write operation may fail"]
fn write<W: Write>(self, sink: &mut W, prev_state: WriterState) -> Result;
fn write<W: Write>(
self,
sink: &mut W,
prev_state: WriterState,
escaper: &S,
) -> Result;
/// Allocate a new buffer and write into it,
/// returning both the new buffer and the writer state.
@ -147,17 +157,23 @@ pub trait XmlWriter: Sized {
fn write_new(
self,
prev_state: WriterState,
escaper: &S,
) -> Result<(Vec<u8>, WriterState)> {
let mut buf = Vec::<u8>::new();
let state = self.write(&mut buf, prev_state)?;
let state = self.write(&mut buf, prev_state, escaper)?;
Ok((buf, state))
}
}
impl XmlWriter for QName {
impl<S: Escaper> XmlWriter<S> for QName {
#[inline]
fn write<W: Write>(self, sink: &mut W, prev_state: WriterState) -> Result {
fn write<W: Write>(
self,
sink: &mut W,
prev_state: WriterState,
_escaper: &S,
) -> Result {
if let Some(prefix) = self.prefix() {
sink.write(prefix.lookup_str().as_bytes())?;
sink.write(b":")?;
@ -168,77 +184,76 @@ impl XmlWriter for QName {
}
}
impl XmlWriter for Token {
fn write<W: Write>(self, sink: &mut W, prev_state: WriterState) -> Result {
type S = WriterState; // More concise
impl<S: Escaper> XmlWriter<S> for Token {
fn write<W: Write>(
self,
sink: &mut W,
prev_state: WriterState,
escaper: &S,
) -> Result {
type W = WriterState; // More concise
match (self, prev_state) {
(Self::Open(name, _), S::NodeExpected | S::NodeOpen) => {
(Self::Open(name, _), W::NodeExpected | W::NodeOpen) => {
// If a node is still open, then we are a child.
prev_state.close_tag_if_open(sink)?;
sink.write(b"<")?;
name.write(sink, prev_state)?;
name.write(sink, prev_state, escaper)?;
Ok(S::NodeOpen)
Ok(W::NodeOpen)
}
(Self::Close(None, _), S::NodeOpen) => {
(Self::Close(None, _), W::NodeOpen) => {
sink.write(b"/>")?;
Ok(S::NodeExpected)
Ok(W::NodeExpected)
}
(Self::Close(Some(name), _), S::NodeExpected | S::NodeOpen) => {
(Self::Close(Some(name), _), W::NodeExpected | W::NodeOpen) => {
// If open, we're going to produce an element of the form
// `<foo></foo>`.
prev_state.close_tag_if_open(sink)?;
sink.write(b"</")?;
name.write(sink, prev_state)?;
name.write(sink, prev_state, escaper)?;
sink.write(b">")?;
Ok(S::NodeExpected)
Ok(W::NodeExpected)
}
(Self::AttrName(name, _), S::NodeOpen) => {
(Self::AttrName(name, _), W::NodeOpen) => {
sink.write(b" ")?;
name.write(sink, prev_state)?;
name.write(sink, prev_state, escaper)?;
Ok(S::AttrNameAdjacent)
Ok(W::AttrNameAdjacent)
}
(Self::AttrValue(AttrValue(value), _), S::AttrNameAdjacent) => {
(Self::AttrValue(value, _), W::AttrNameAdjacent) => {
sink.write(b"=\"")?;
sink.write(value.into_escaped().lookup_str().as_bytes())?;
sink.write(escaper.escape(value).lookup_str().as_bytes())?;
sink.write(b"\"")?;
Ok(S::NodeOpen)
Ok(W::NodeOpen)
}
(Self::AttrValue(AttrValue(value), _), S::AttrFragmentAdjacent) => {
sink.write(value.into_escaped().lookup_str().as_bytes())?;
(Self::AttrValue(value, _), W::AttrFragmentAdjacent) => {
sink.write(escaper.escape(value).lookup_str().as_bytes())?;
sink.write(b"\"")?;
Ok(S::NodeOpen)
Ok(W::NodeOpen)
}
(
Self::AttrValueFragment(AttrValue(value), _),
S::AttrNameAdjacent,
) => {
(Self::AttrValueFragment(value, _), W::AttrNameAdjacent) => {
sink.write(b"=\"")?;
sink.write(value.into_escaped().lookup_str().as_bytes())?;
sink.write(escaper.escape(value).lookup_str().as_bytes())?;
Ok(S::AttrFragmentAdjacent)
Ok(W::AttrFragmentAdjacent)
}
(
Self::AttrValueFragment(AttrValue(value), _),
S::AttrFragmentAdjacent,
) => {
sink.write(value.into_escaped().lookup_str().as_bytes())?;
(Self::AttrValueFragment(value, _), W::AttrFragmentAdjacent) => {
sink.write(escaper.escape(value).lookup_str().as_bytes())?;
Ok(S::AttrFragmentAdjacent)
Ok(W::AttrFragmentAdjacent)
}
// AttrEnd is ignored by the writer (and is optional).
@ -247,51 +262,51 @@ impl XmlWriter for Token {
// Unescaped not yet supported, but you could use CData.
(
Self::Text(Text::Escaped(text), _),
S::NodeExpected | S::NodeOpen,
W::NodeExpected | W::NodeOpen,
) => {
prev_state.close_tag_if_open(sink)?;
sink.write(text.lookup_str().as_bytes())?;
Ok(S::NodeExpected)
Ok(W::NodeExpected)
}
// Escaped not yet supported, but you could use Text.
(
Self::CData(Text::Unescaped(text), _),
S::NodeExpected | S::NodeOpen,
W::NodeExpected | W::NodeOpen,
) => {
prev_state.close_tag_if_open(sink)?;
sink.write(b"<![CDATA[")?;
sink.write(text.lookup_str().as_bytes())?;
sink.write(b"]]>")?;
Ok(S::NodeExpected)
Ok(W::NodeExpected)
}
// Unescaped not yet supported, since we do not have a use case.
(
Self::Comment(Text::Escaped(comment), _),
S::NodeExpected | S::NodeOpen,
W::NodeExpected | W::NodeOpen,
) => {
prev_state.close_tag_if_open(sink)?;
sink.write(b"<!--")?;
sink.write(comment.lookup_str().as_bytes())?;
sink.write(b"-->")?;
Ok(S::NodeExpected)
Ok(W::NodeExpected)
}
(Self::Whitespace(ws, _), S::NodeOpen) => {
(Self::Whitespace(ws, _), W::NodeOpen) => {
sink.write(ws.lookup_str().as_bytes())?;
Ok(S::NodeOpen)
Ok(W::NodeOpen)
}
// As-of-yet unsupported operations that weren't needed at the
// time of writing, but were planned for in the design of Xir.
(invalid @ Self::AttrName(_, _), S::AttrNameAdjacent)
| (invalid @ Self::Text(Text::Unescaped(_), _), S::NodeExpected)
| (invalid @ Self::CData(Text::Escaped(_), _), S::NodeExpected) => {
(invalid @ Self::AttrName(_, _), W::AttrNameAdjacent)
| (invalid @ Self::Text(Text::Unescaped(_), _), W::NodeExpected)
| (invalid @ Self::CData(Text::Escaped(_), _), W::NodeExpected) => {
Err(Error::Todo(format!("{:?}", invalid), prev_state))
}
@ -306,30 +321,52 @@ impl XmlWriter for Token {
}
}
impl<I: TokenStream> XmlWriter for I {
impl<I: TokenStream, S: Escaper> XmlWriter<S> for I {
fn write<W: Write>(
mut self,
sink: &mut W,
initial_state: WriterState,
escaper: &S,
) -> Result {
self.try_fold(initial_state, |prev_state, tok| {
tok.write(sink, prev_state)
tok.write(sink, prev_state, escaper)
})
}
}
#[cfg(test)]
mod test {
use std::convert::{TryFrom, TryInto};
use std::{
borrow::Cow,
convert::{TryFrom, TryInto},
};
use super::*;
use crate::{
span::Span,
sym::GlobalSymbolIntern,
xir::{AttrValue, QName, Text, Whitespace},
xir::{QName, Text, Whitespace},
};
type TestResult = std::result::Result<(), Error>;
type Esc = DefaultEscaper;
#[derive(Debug, Default)]
struct MockEscaper {}

// Marks escaped output by appending ":ESC" to the provided byte slice,
// making it easy to see in assertions whether the writer escaped a value.
impl Escaper for MockEscaper {
    fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
        Cow::Owned([value, &b":ESC"[..]].concat())
    }

    fn unescape_bytes(_: &[u8]) -> result::Result<Cow<[u8]>, XirError> {
        unreachable!("Writer should not be unescaping!")
    }
}
lazy_static! {
static ref S: Span =
@ -339,7 +376,8 @@ mod test {
#[test]
fn writes_beginning_node_tag_without_prefix() -> TestResult {
let name = QName::new_local("no-prefix".try_into()?);
let result = Token::Open(name, *S).write_new(Default::default())?;
let result = Token::Open(name, *S)
.write_new(Default::default(), &Esc::default())?;
assert_eq!(result.0, b"<no-prefix");
assert_eq!(result.1, WriterState::NodeOpen);
@ -350,7 +388,8 @@ mod test {
#[test]
fn writes_beginning_node_tag_with_prefix() -> TestResult {
let name = QName::try_from(("prefix", "element-name"))?;
let result = Token::Open(name, *S).write_new(Default::default())?;
let result = Token::Open(name, *S)
.write_new(Default::default(), &Esc::default())?;
assert_eq!(result.0, b"<prefix:element-name");
assert_eq!(result.1, WriterState::NodeOpen);
@ -361,7 +400,8 @@ mod test {
#[test]
fn closes_open_node_when_opening_another() -> TestResult {
let name = QName::try_from(("p", "another-element"))?;
let result = Token::Open(name, *S).write_new(WriterState::NodeOpen)?;
let result = Token::Open(name, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"><p:another-element");
assert_eq!(result.1, WriterState::NodeOpen);
@ -371,7 +411,8 @@ mod test {
#[test]
fn closes_open_node_as_empty_element() -> TestResult {
let result = Token::Close(None, *S).write_new(WriterState::NodeOpen)?;
let result = Token::Close(None, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"/>");
assert_eq!(result.1, WriterState::NodeExpected);
@ -384,7 +425,7 @@ mod test {
let name = QName::try_from(("a", "closed-element"))?;
let result = Token::Close(Some(name), *S)
.write_new(WriterState::NodeExpected)?;
.write_new(WriterState::NodeExpected, &Esc::default())?;
assert_eq!(result.0, b"</a:closed-element>");
assert_eq!(result.1, WriterState::NodeExpected);
@ -398,8 +439,8 @@ mod test {
fn closes_open_node_with_closing_tag() -> TestResult {
let name = QName::try_from(("b", "closed-element"))?;
let result =
Token::Close(Some(name), *S).write_new(WriterState::NodeOpen)?;
let result = Token::Close(Some(name), *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"></b:closed-element>");
assert_eq!(result.1, WriterState::NodeExpected);
@ -411,7 +452,7 @@ mod test {
#[test]
fn whitespace_within_open_node() -> TestResult {
let result = Token::Whitespace(Whitespace::try_from(" \t ")?, *S)
.write_new(WriterState::NodeOpen)?;
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b" \t ");
assert_eq!(result.1, WriterState::NodeOpen);
@ -425,14 +466,14 @@ mod test {
let name_local = QName::new_local("nons".try_into()?);
// Namespace prefix
let result =
Token::AttrName(name_ns, *S).write_new(WriterState::NodeOpen)?;
let result = Token::AttrName(name_ns, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b" some:attr");
assert_eq!(result.1, WriterState::AttrNameAdjacent);
// No namespace prefix
let result =
Token::AttrName(name_local, *S).write_new(WriterState::NodeOpen)?;
let result = Token::AttrName(name_local, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b" nons");
assert_eq!(result.1, WriterState::AttrNameAdjacent);
@ -441,12 +482,14 @@ mod test {
#[test]
fn writes_attr_value_when_adjacent_to_attr() -> TestResult {
let value = AttrValue::from("test str".intern());
let value = "test str".intern();
let result = Token::AttrValue(value, *S)
.write_new(WriterState::AttrNameAdjacent)?;
let result = Token::AttrValue(value, *S).write_new(
WriterState::AttrNameAdjacent,
&MockEscaper::default(),
)?;
assert_eq!(result.0, br#"="test str""#);
assert_eq!(result.0, br#"="test str:ESC""#);
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
@ -454,17 +497,19 @@ mod test {
#[test]
fn writes_attr_value_consisting_of_fragments() -> TestResult {
let value_left = AttrValue::from("left ".intern());
let value_right = AttrValue::from("right".intern());
let value_left = "left".intern();
let value_mid = " mid".intern();
let value_right = " right".intern();
let result = vec![
Token::AttrValueFragment(value_left, *S),
Token::AttrValueFragment(value_mid, *S),
Token::AttrValue(value_right, *S),
]
.into_iter()
.write_new(WriterState::AttrNameAdjacent)?;
.write_new(WriterState::AttrNameAdjacent, &MockEscaper::default())?;
assert_eq!(result.0, br#"="left right""#);
assert_eq!(result.0, br#"="left:ESC mid:ESC right:ESC""#);
assert_eq!(result.1, WriterState::NodeOpen);
Ok(())
@ -474,7 +519,8 @@ mod test {
// just ignore it entirely.
#[test]
fn ignores_attr_end() -> TestResult {
let result = Token::AttrEnd.write_new(WriterState::NodeOpen)?;
let result =
Token::AttrEnd.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"");
assert_eq!(result.1, WriterState::NodeOpen);
@ -488,13 +534,14 @@ mod test {
let text = Text::Escaped("test > escaped".intern());
// When a node is expected.
let result =
Token::Text(text, *S).write_new(WriterState::NodeExpected)?;
let result = Token::Text(text, *S)
.write_new(WriterState::NodeExpected, &Esc::default())?;
assert_eq!(result.0, b"test > escaped");
assert_eq!(result.1, WriterState::NodeExpected);
// When a node is still open.
let result = Token::Text(text, *S).write_new(WriterState::NodeOpen)?;
let result = Token::Text(text, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b">test > escaped");
assert_eq!(result.1, WriterState::NodeExpected);
@ -508,13 +555,14 @@ mod test {
let text = Text::Unescaped("test > unescaped".intern());
// When a node is expected.
let result =
Token::CData(text, *S).write_new(WriterState::NodeExpected)?;
let result = Token::CData(text, *S)
.write_new(WriterState::NodeExpected, &Esc::default())?;
assert_eq!(result.0, b"<![CDATA[test > unescaped]]>");
assert_eq!(result.1, WriterState::NodeExpected);
// When a node is still open.
let result = Token::CData(text, *S).write_new(WriterState::NodeOpen)?;
let result = Token::CData(text, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"><![CDATA[test > unescaped]]>");
assert_eq!(result.1, WriterState::NodeExpected);
@ -528,14 +576,14 @@ mod test {
let comment = Text::Escaped("comment > escaped".intern());
// When a node is expected.
let result =
Token::Comment(comment, *S).write_new(WriterState::NodeExpected)?;
let result = Token::Comment(comment, *S)
.write_new(WriterState::NodeExpected, &Esc::default())?;
assert_eq!(result.0, b"<!--comment > escaped-->");
assert_eq!(result.1, WriterState::NodeExpected);
// When a node is still open.
let result =
Token::Comment(comment, *S).write_new(WriterState::NodeOpen)?;
let result = Token::Comment(comment, *S)
.write_new(WriterState::NodeOpen, &Esc::default())?;
assert_eq!(result.0, b"><!--comment > escaped-->");
assert_eq!(result.1, WriterState::NodeExpected);
@ -545,8 +593,11 @@ mod test {
#[test]
fn unsupported_transition_results_in_error() -> TestResult {
assert!(matches!(
Token::AttrValue(AttrValue::from("".intern()), *S)
.write(&mut vec![], WriterState::NodeExpected),
Token::AttrValue("".intern(), *S).write(
&mut vec![],
WriterState::NodeExpected,
&Esc::default()
),
Err(Error::UnexpectedToken(_, WriterState::NodeExpected)),
));
@ -562,7 +613,7 @@ mod test {
let result = vec![
Token::Open(root, *S),
Token::AttrName(("an", "attr").try_into()?, *S),
Token::AttrValue(AttrValue::from("value".intern()), *S),
Token::AttrValue("value".intern(), *S),
Token::AttrEnd,
Token::Text(Text::Escaped("text".intern()), *S),
Token::Open(("c", "child").try_into()?, *S),
@ -571,7 +622,7 @@ mod test {
Token::Close(Some(root), *S),
]
.into_iter()
.write_new(Default::default())?;
.write_new(Default::default(), &Esc::default())?;
assert_eq!(
result.0,