tamer: xir::escape: Remove XirString in favor of Escaper

This rewrites a good portion of the previous commit.

Rather than explicitly storing whether a given string has been escaped, we
can instead assume that all SymbolIds leaving or entering XIR are unescaped,
because there is no reason for any other part of the system to deal with
such details of XML documents.

Given that, we need only unescape on read and escape on write.  This is
customary, so why didn't I do that to begin with?

The previous commit outlines the reason, mainly being an optimization for
the echo writer that is upcoming.  However, this solution will end up being
better---it's not implemented yet, but we can have a caching layer, such
that the Escaper records a mapping between escaped and unescaped SymbolIds
to avoid work the next time around.  If we share the Escaper between _all_
readers and the writer, the result is that

  1. Duplicate strings between source files and object files (many of which
     are read by both the linker and compiler) avoid re-unescaping; and
  2. Writers can use this cache to avoid re-escaping when we've already seen
     the escaped variant of the string during read.

The alternative would be a global cache, like the internment system, but I
did not find that to be appropriate here, since this is far less
fundamental and is much easier to compose.

DEV-11081
main
Mike Gerwitz 2021-11-12 13:59:14 -05:00
parent b1c0783c75
commit 27ba03b59b
13 changed files with 554 additions and 629 deletions

View File

@ -138,8 +138,8 @@ mod writer {
Writer as QuickXmlWriter,
};
use std::borrow::Cow;
use tamer::span::Span;
use tamer::xir::{writer::XmlWriter, Text};
use tamer::{span::Span, xir::DefaultEscaper};
const FRAGMENT: &str = r#"<fragment>
This is pretend fragment text. We need a lot of it.
@ -216,7 +216,7 @@ This is pretend fragment text. We need a lot of it.</fragment>
Token::Close(None, span),
]
.into_iter()
.write(&mut buf, Default::default())
.write(&mut buf, Default::default(), &DefaultEscaper::default())
.unwrap();
});
});
@ -253,7 +253,11 @@ This is pretend fragment text. We need a lot of it.</fragment>
bench.iter(|| {
(0..50).for_each(|_| {
Token::Text(Text::Escaped(frag), span)
.write(&mut buf, Default::default())
.write(
&mut buf,
Default::default(),
&DefaultEscaper::default(),
)
.unwrap();
});
});

View File

@ -25,12 +25,14 @@ use super::xmle::{
xir::lower_iter,
XmleSections,
};
use crate::asg::{Asg, DefaultAsg, IdentObject};
use crate::global;
use crate::obj::xmlo::{AsgBuilder, AsgBuilderState, XmloReader};
use crate::sym::SymbolId;
use crate::sym::{GlobalSymbolIntern, GlobalSymbolResolve};
use crate::xir::writer::XmlWriter;
use crate::{
asg::{Asg, DefaultAsg, IdentObject},
xir::DefaultEscaper,
};
use crate::{
fs::{
Filesystem, FsCanonicalizer, PathFile, VisitOnceFile,
@ -38,6 +40,10 @@ use crate::{
},
ld::xmle::Sections,
};
use crate::{
obj::xmlo::{AsgBuilder, AsgBuilderState, XmloReader},
xir::Escaper,
};
use fxhash::FxBuildHasher;
use petgraph_graphml::GraphMl;
use std::error::Error;
@ -54,11 +60,24 @@ type LinkerAsgBuilderState =
pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
let mut fs = VisitOnceFilesystem::new();
let mut depgraph = LinkerAsg::with_capacity(65536, 65536);
let escaper = {
#[cfg(feature = "wip-xmlo-xir-reader")]
{
DefaultEscaper::default()
}
#[cfg(not(feature = "wip-xmlo-xir-reader"))]
{
// The original POC linker did nothing with escape sequences,
// since it simply shuffles data around and re-outputs as XML.
crate::xir::NullEscaper::default()
}
};
let state = load_xmlo(
package_path,
&mut fs,
&mut depgraph,
&escaper,
AsgBuilderState::new(),
)?;
@ -105,6 +124,7 @@ pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
name.expect("missing root package name"),
relroot.expect("missing root package relroot"),
output,
&escaper,
)?;
Ok(())
@ -113,11 +133,13 @@ pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
pub fn graphml(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
let mut fs = VisitOnceFilesystem::new();
let mut depgraph = LinkerAsg::with_capacity(65536, 65536);
let escaper = DefaultEscaper::default();
let _ = load_xmlo(
package_path,
&mut fs,
&mut depgraph,
&escaper,
AsgBuilderState::new(),
)?;
@ -161,10 +183,11 @@ pub fn graphml(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
Ok(())
}
fn load_xmlo<'a, P: AsRef<Path>>(
fn load_xmlo<'a, P: AsRef<Path>, S: Escaper>(
path_str: P,
fs: &mut VisitOnceFilesystem<FsCanonicalizer, FxBuildHasher>,
depgraph: &mut LinkerAsg,
escaper: &S,
state: LinkerAsgBuilderState,
) -> Result<LinkerAsgBuilderState, Box<dyn Error>> {
let cfile: PathFile<BufReader<fs::File>> = match fs.open(path_str)? {
@ -186,7 +209,7 @@ fn load_xmlo<'a, P: AsRef<Path>>(
use crate::iter::into_iter_while_ok;
use crate::xir::reader::XmlXirReader;
into_iter_while_ok(XmlXirReader::from(file), |toks| {
into_iter_while_ok(XmlXirReader::new(file, escaper), |toks| {
let xmlo: XmloReader<_> = toks.into();
depgraph.import_xmlo(xmlo, state)
})??
@ -204,22 +227,27 @@ fn load_xmlo<'a, P: AsRef<Path>>(
path_buf.push(str);
path_buf.set_extension("xmlo");
state = load_xmlo(path_buf, fs, depgraph, state)?;
state = load_xmlo(path_buf, fs, depgraph, escaper, state)?;
}
Ok(state)
}
fn output_xmle<'a, S: XmleSections<'a>>(
sorted: S,
fn output_xmle<'a, X: XmleSections<'a>, S: Escaper>(
sorted: X,
name: SymbolId,
relroot: SymbolId,
output: &str,
escaper: &S,
) -> Result<(), Box<dyn Error>> {
let file = fs::File::create(output)?;
let mut buf = BufWriter::new(file);
lower_iter(sorted, name, relroot).write(&mut buf, Default::default())?;
lower_iter(sorted, name, relroot).write(
&mut buf,
Default::default(),
escaper,
)?;
buf.flush()?;
Ok(())

View File

@ -35,7 +35,7 @@ use crate::{
sym::{st::*, SymbolId},
xir::{
iter::{elem_wrap, ElemWrapIter},
AttrValue, QName, Text, Token, XirString,
QName, Text, Token,
},
};
use arrayvec::ArrayVec;
@ -75,28 +75,21 @@ type HeaderIter = array::IntoIter<Token, HEADER_SIZE>;
/// and its immediate child.
#[inline]
fn header(pkg_name: SymbolId, relroot: SymbolId) -> HeaderIter {
// TODO: Introduce newtypes so that we do not have to make unsafe
// assumptions.
let pkg_name_val =
AttrValue::from(unsafe { XirString::assume_fixed(pkg_name) });
let relroot_val =
AttrValue::from(unsafe { XirString::assume_fixed(relroot) });
[
Token::AttrName(QN_XMLNS, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_RATER), LSPAN),
Token::AttrValue(raw::URI_LV_RATER, LSPAN),
Token::AttrName(QN_XMLNS_PREPROC, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_PREPROC), LSPAN),
Token::AttrValue(raw::URI_LV_PREPROC, LSPAN),
Token::AttrName(QN_XMLNS_L, LSPAN),
Token::AttrValue(AttrValue::st_uri(URI_LV_LINKER), LSPAN),
Token::AttrValue(raw::URI_LV_LINKER, LSPAN),
Token::AttrName(QN_TITLE, LSPAN),
Token::AttrValue(pkg_name_val, LSPAN),
Token::AttrValue(pkg_name, LSPAN),
Token::AttrName(QN_PROGRAM, LSPAN),
Token::AttrValue(AttrValue::st_cid(L_TRUE), LSPAN),
Token::AttrValue(raw::L_TRUE, LSPAN),
Token::AttrName(QN_NAME, LSPAN),
Token::AttrValue(pkg_name_val, LSPAN),
Token::AttrValue(pkg_name, LSPAN),
Token::AttrName(QN_UUROOTPATH, LSPAN),
Token::AttrValue(relroot_val, LSPAN),
Token::AttrValue(relroot, LSPAN),
]
.into_iter()
}
@ -124,7 +117,7 @@ struct DepListIter<'a> {
/// Constant-size [`Token`] buffer used as a stack.
toks: ArrayVec<Token, DEP_TOK_SIZE>,
/// Relative path to project root.
relroot: AttrValue,
relroot: SymbolId,
}
impl<'a> DepListIter<'a> {
@ -133,11 +126,7 @@ impl<'a> DepListIter<'a> {
Self {
iter,
toks: ArrayVec::new(),
// TODO: we cannot trust that an arbitrary symbol is escaped; this
// needs better typing, along with other things.
relroot: AttrValue::from(unsafe {
XirString::assume_fixed(relroot)
}),
relroot,
}
}
@ -172,10 +161,7 @@ impl<'a> DepListIter<'a> {
if let Some(pkg_name) = src.pkg_name {
// TODO: Introduce newtypes so that we do not have to make unsafe
// assumptions.
let pkg_name_val =
AttrValue::from(unsafe { XirString::assume_fixed(pkg_name) });
self.toks.push(Token::AttrValue(pkg_name_val, LSPAN));
self.toks.push(Token::AttrValue(pkg_name, LSPAN));
self.toks.push(Token::AttrValueFragment(self.relroot, LSPAN));
self.toks.push(Token::AttrName(QN_SRC, LSPAN));
}
@ -199,9 +185,7 @@ impl<'a> DepListIter<'a> {
#[inline]
fn toks_push_attr(&mut self, name: QName, value: Option<SymbolId>) {
if let Some(val) = value {
let attr_val = AttrValue::from(val);
self.toks.push(Token::AttrValue(attr_val, LSPAN));
self.toks.push(Token::AttrValue(val, LSPAN));
self.toks.push(Token::AttrName(name, LSPAN));
}
}
@ -284,11 +268,7 @@ impl MapFromsIter {
self.iter.next().and_then(|from| {
self.toks.push(Token::Close(None, LSPAN));
// TODO
let from_val =
AttrValue::from(unsafe { XirString::assume_fixed(from) });
self.toks.push(Token::AttrValue(from_val, LSPAN));
self.toks.push(Token::AttrValue(from, LSPAN));
self.toks.push(Token::AttrName(QN_NAME, LSPAN));
Some(Token::Open(QN_L_FROM, LSPAN))

View File

@ -256,12 +256,12 @@ fn test_writes_deps() -> TestResult {
assert_eq!(
attrs.find(QN_NAME).and_then(|a| a.value_atom()),
Some(AttrValue::from(ident.name())),
Some(ident.name()),
);
assert_eq!(
attrs.find(QN_TYPE).and_then(|a| a.value_atom()),
Some(AttrValue::from(ident.kind().unwrap().as_sym()))
Some(ident.kind().unwrap().as_sym())
);
let generated = attrs.find(QN_GENERATED).and_then(|a| a.value_atom());
@ -270,17 +270,17 @@ fn test_writes_deps() -> TestResult {
generated: true, ..
}) = ident.src()
{
assert_eq!(generated, Some(AttrValue::from("true".intern())));
assert_eq!(generated, Some("true".intern()));
} else {
assert_eq!(generated, None);
}
if let Some(Source { parent, .. }) = ident.src() {
assert_attr!(attrs, QN_PARENT, parent.map(|x| AttrValue::from(x)),);
assert_attr!(attrs, QN_PARENT, *parent,);
}
if let Some(Source { yields, .. }) = ident.src() {
assert_attr!(attrs, QN_YIELDS, yields.map(|x| AttrValue::from(x)),);
assert_attr!(attrs, QN_YIELDS, *yields,);
}
if let Some(Source {
@ -311,10 +311,7 @@ fn test_writes_deps() -> TestResult {
Some(Attr::Extensible(parts)) => {
assert_eq!(
parts.value_fragments(),
&vec![
(AttrValue::from(relroot), LSPAN),
(AttrValue::from(*pkg_name), LSPAN),
]
&vec![(relroot, LSPAN), (*pkg_name, LSPAN),]
);
}
invalid => panic!("unexpected desc: {:?}", invalid),
@ -327,7 +324,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DIM,
Some(AttrValue::from(Into::<SymbolId>::into(*dim))),
Some(Into::<SymbolId>::into(*dim)),
"invalid {:?} @dim",
ident.kind()
);
@ -341,7 +338,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DIM,
Some(AttrValue::from(Into::<SymbolId>::into(*dim))),
Some((*dim).into()),
"invalid {:?} @dim",
ident.kind()
);
@ -349,7 +346,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DTYPE,
Some(AttrValue::from(Into::<SymbolId>::into(*dtype))),
Some((*dtype).into()),
"invalid {:?} @dtype",
ident.kind()
);
@ -359,7 +356,7 @@ fn test_writes_deps() -> TestResult {
assert_attr!(
attrs,
QN_DTYPE,
Some(AttrValue::from(Into::<SymbolId>::into(*dtype))),
Some((*dtype).into()),
"invalid {:?} @dim",
ident.kind()
);
@ -438,8 +435,8 @@ fn test_writes_map_froms() -> TestResult {
);
});
assert!(found.contains(&AttrValue::from("froma".intern())));
assert!(found.contains(&AttrValue::from("fromb".intern())));
assert!(found.contains(&"froma".intern()));
assert!(found.contains(&"fromb".intern()));
Ok(())
}

View File

@ -50,17 +50,6 @@
//!# }
//! ```
//!
//! However,
//! certain elements cannot fully parse on their own because require
//! important contextual information,
//! such as [`AttrValue`],
//! which requires knowing whether the provided value is escaped.
//! It is important that the caller is diligent in making the proper
//! determination in these cases,
//! otherwise it could result in situations ranging from invalid
//! compiler output to security vulnerabilities
//! (via XML injection).
//!
//! To parse an entire XML document,
//! see [`reader`].
@ -68,19 +57,17 @@ use crate::span::Span;
use crate::sym::{
st_as_sym, CIdentStaticSymbolId, GlobalSymbolIntern,
GlobalSymbolInternBytes, StaticSymbolId, SymbolId, TameIdentStaticSymbolId,
UriStaticSymbolId,
};
use memchr::memchr;
use std::convert::{TryFrom, TryInto};
use std::fmt::Display;
use std::hash::Hash;
use std::ops::Deref;
mod error;
pub use error::Error;
mod string;
pub use string::XirString;
mod escape;
pub use escape::{DefaultEscaper, Escaper, NullEscaper};
pub mod iter;
pub mod pred;
@ -494,47 +481,6 @@ pub enum Text {
Escaped(SymbolId),
}
/// Represents an attribute value and its escaped contents.
///
/// Being explicit about the state of escaping allows us to skip checks when
/// we know that the generated text could not possibly require escaping.
/// This does, however, put the onus on the caller to ensure that they got
/// the escaping status correct.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct AttrValue(XirString);
impl AttrValue {
/// Construct a constant escaped attribute from a static C-style symbol.
pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self {
Self(XirString::st_cid(sym))
}
/// Construct a constant escaped attribute from a static URI symbol.
///
/// URIs are expected _not_ to contain quotes.
pub const fn st_uri(sym: UriStaticSymbolId) -> Self {
Self(XirString::st_uri(sym))
}
}
impl<T: Into<XirString>> From<T> for AttrValue {
fn from(s: T) -> Self {
Self(s.into())
}
}
impl Into<SymbolId> for AttrValue {
fn into(self) -> SymbolId {
self.0.into()
}
}
impl Display for AttrValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Lightly-structured XML tokens with associated [`Span`]s.
///
/// This is a streamable IR for XML.
@ -575,7 +521,7 @@ pub enum Token {
AttrName(QName, Span),
/// Element attribute value.
AttrValue(AttrValue, Span),
AttrValue(SymbolId, Span),
/// A portion of an element attribute value.
///
@ -586,7 +532,7 @@ pub enum Token {
/// Since each fragment contains a span,
/// this also potentially gives higher resolution for the origin of
/// components of generated attribute values.
AttrValueFragment(AttrValue, Span),
AttrValueFragment(SymbolId, Span),
/// A delimiter indicating that attribute processing has ended and the
/// next token will be either a child node or [`Token::Close`].

View File

@ -0,0 +1,188 @@
// XIR string escaping and unescaping
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Escaping and unescaping for writers and readers respectively.
//!
//! An [`Escaper`] is required by XIR readers and writers.
//! An escaper may perform caching to avoid unnecessary work,
//! so it is advantageous to provide the _same_ instance to all readers
//! and writers.
//! [`Escaper`] methods use interior mutability to facilitate this,
//! since TAMER streams lowering operations where possible,
//! meaning that multiple readers and writers will require references
//! to the [`Escaper`].
//!
//! Safety
//! ======
//! The purpose of this type is to provide safety against XML injection by
//! encapsulating all responsibility within a single object.
//! The idea is simple:
//! a [`SymbolId`] _always_ represents an unescaped string.
//! This prevents, primarily,
//!
//! 1. XML injection (via lack of escaping); and
//! 2. Erroneous multiple escape/unescape.
//!
//! This module is the _only_ part of the system that has access to raw,
//! escaped values.
//! Outside of this module,
//! it is assumed that the rest of the system is working with _unescaped_
//! values---afterall,
//! why would other parts of the system not dealing with XML directly
//! take it upon themselves to deal with XML directly?
//! If we permitted retrieving raw escaped [`SymbolId`]s,
//! then we run the risk of that value being used to construct a XIR
//! stream and be subsequently double-encoded upon writing.
use crate::sym::{
GlobalSymbolInternBytes, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
SymbolId,
};
use std::borrow::Cow;
use super::Error;
/// XIR escaper and unescaper.
///
/// Escapers are responsible for parsing XML escape sequences as necessary
/// on read,
/// and properly escaping characters on write.
/// This is the only part of the system defending XIR against XML
/// injection.
///
/// Escapers must use interior mutability for any internal state
/// (e.g. caching),
/// since multiple readers and writers will require references.
pub trait Escaper: Default {
/// Escape raw bytes such that they become suitable for writing into an
/// XML document as text.
///
/// This value must be escaped such that subsequence unescaping
/// (using [`unescape_bytes`](Escaper::unescape_bytes))
/// will result in the same value.
fn escape_bytes(value: &[u8]) -> Cow<[u8]>;
/// Unescape raw bytes such that any relevant escape sequences are
/// parsed into their text representation.
fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error>;
/// Escape the given symbol and produce a [`SymbolId`] representing
/// the escaped value suitable for writing.
fn escape(&self, sym: SymbolId) -> SymbolId {
match Self::escape_bytes(sym.lookup_str().as_bytes()) {
// We got back what we sent in,
// so this value is fixed.
Cow::Borrowed(_) => sym,
// The value changed,
// so we must allocate a new symbol.
// SAFETY: The unescaped symbol is valid UTF-8 unless it was
// unsafely allocated.
// Given that escaping does not introduce any invalid UTF-8
// sequences
// (as is trivially verified by reading its implementation),
// we can skip the UTF-8 check.
Cow::Owned(esc) => unsafe { esc[..].intern_utf8_unchecked() },
}
}
/// Unescape the provided raw value and return a [`SymbolId`]
/// representing the unescaped value.
fn unescape_intern<'a>(
&self,
escaped: &'a [u8],
) -> Result<SymbolId, Error> {
Ok(match Self::unescape_bytes(escaped)? {
// We got back what we sent in,
// so this value is fixed.
Cow::Borrowed(orig) => {
debug_assert!(orig == escaped);
orig.intern_utf8()?
}
// The value was rewritten,
// meaning that the original was escaped.
// We can't assume that it's valid UTF-8.
Cow::Owned(unesc) => unesc.intern_utf8()?,
})
}
}
/// Escape and unescape using [`quick_xml`].
#[derive(Debug, Clone, Copy, Default)]
pub struct QuickXmlEscaper {}
impl Escaper for QuickXmlEscaper {
#[inline]
fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
quick_xml::escape::escape(value)
}
#[inline]
fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error> {
// For some reason,
// quick-xml has made EscapeError explicitly private to the crate,
// and so it is opaque to us.
// They have, however,
// implemented `From<EscapeError> for Error`,
// which we will use here.
Ok(quick_xml::escape::unescape(value)
.map_err(quick_xml::Error::from)?)
}
}
/// Perform no escaping or unescaping.
///
/// _This should be removed after development of the XIR-based readers!_
#[cfg(not(feature = "wip-xmlo-xir-reader"))]
#[derive(Debug, Clone, Copy, Default)]
pub struct NullEscaper {}
#[cfg(not(feature = "wip-xmlo-xir-reader"))]
impl Escaper for NullEscaper {
#[inline]
fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
Cow::Borrowed(value)
}
#[inline]
fn unescape_bytes(_value: &[u8]) -> Result<Cow<[u8]>, Error> {
panic!("NullEscaper should not be used for unescaping")
}
}
pub type DefaultEscaper = QuickXmlEscaper;
#[cfg(test)]
mod test {
use super::*;
use crate::sym::GlobalSymbolIntern;
// Simple sanity check to ensure that the default escaper actually does
// some sort of escaping.
#[test]
fn default_escaper_escapes() {
let sut = DefaultEscaper::default();
assert_eq!(
"foo&lt;bar".intern(),
sut.escape("foo<bar".intern()).into(),
);
}
}

View File

@ -21,7 +21,7 @@
//!
//! This uses [`quick_xml`] as the parser.
use super::{Error, Token, XirString};
use super::{DefaultEscaper, Error, Escaper, Token};
use crate::{span::DUMMY_SPAN, sym::GlobalSymbolInternBytes, xir::Text};
use quick_xml::{
self,
@ -46,7 +46,11 @@ pub type Result<T> = result::Result<T, Error>;
///
/// [`None`] is returned only on EOF,
/// not on error.
pub struct XmlXirReader<B: BufRead> {
pub struct XmlXirReader<'s, B, S = DefaultEscaper>
where
B: BufRead,
S: Escaper,
{
/// Inner parser.
reader: quick_xml::Reader<B>,
@ -61,10 +65,13 @@ pub struct XmlXirReader<B: BufRead> {
/// after which [`XmlXirReader::refill_buf`] requests another token
/// from `reader`.
tokbuf: VecDeque<Token>,
/// System for unescaping string data.
escaper: &'s S,
}
impl<B: BufRead> XmlXirReader<B> {
pub fn new(reader: B) -> Self {
impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
pub fn new(reader: B, escaper: &'s S) -> Self {
let mut reader = quick_xml::Reader::from_reader(reader);
// XIR must support mismatched tags so that we are able to represent
@ -79,6 +86,8 @@ impl<B: BufRead> XmlXirReader<B> {
// but [`Token`]s are small enough that it likely does not
// matter much.
tokbuf: VecDeque::with_capacity(32),
escaper,
}
}
@ -99,21 +108,25 @@ impl<B: BufRead> XmlXirReader<B> {
Ok(ev) => match ev {
QuickXmlEvent::Empty(ele) => Some(
Self::parse_element_open(&mut self.tokbuf, ele).and_then(
|open| {
// Tag is self-closing, but this does not yet
// handle whitespace before the `/`.
self.tokbuf
.push_front(Token::Close(None, DUMMY_SPAN));
Self::parse_element_open(
&self.escaper,
&mut self.tokbuf,
ele,
)
.and_then(|open| {
// Tag is self-closing, but this does not yet
// handle whitespace before the `/`.
self.tokbuf.push_front(Token::Close(None, DUMMY_SPAN));
Ok(open)
},
),
Ok(open)
}),
),
QuickXmlEvent::Start(ele) => {
Some(Self::parse_element_open(&mut self.tokbuf, ele))
}
QuickXmlEvent::Start(ele) => Some(Self::parse_element_open(
&self.escaper,
&mut self.tokbuf,
ele,
)),
QuickXmlEvent::End(ele) => {
Some(ele.name().try_into().map_err(Error::from).and_then(
@ -156,6 +169,7 @@ impl<B: BufRead> XmlXirReader<B> {
/// buffer,
/// since the intent is to provide that token immediately.
fn parse_element_open(
escaper: &'s S,
tokbuf: &mut VecDeque<Token>,
ele: BytesStart,
) -> Result<Token> {
@ -163,7 +177,7 @@ impl<B: BufRead> XmlXirReader<B> {
.try_into()
.map_err(Error::from)
.and_then(|qname| {
Self::parse_attrs(tokbuf, ele.attributes())?;
Self::parse_attrs(escaper, tokbuf, ele.attributes())?;
// The first token will be immediately returned
// via the Iterator.
@ -178,6 +192,7 @@ impl<B: BufRead> XmlXirReader<B> {
/// This does not yet handle whitespace between attributes,
/// or around `=`.
fn parse_attrs<'a>(
escaper: &'s S,
tokbuf: &mut VecDeque<Token>,
mut attrs: Attributes<'a>,
) -> Result<()> {
@ -199,8 +214,7 @@ impl<B: BufRead> XmlXirReader<B> {
// that's okay as long as we can read it again,
// but we probably should still throw an error if we
// encounter such a situation.
let value =
XirString::from_escaped_raw(attr.value.as_ref())?.into();
let value = escaper.unescape_intern(attr.value.as_ref())?.into();
tokbuf.push_front(Token::AttrName(name, DUMMY_SPAN));
tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
@ -215,7 +229,11 @@ impl<B: BufRead> XmlXirReader<B> {
}
}
impl<B: BufRead> Iterator for XmlXirReader<B> {
impl<'s, B, S> Iterator for XmlXirReader<'s, B, S>
where
B: BufRead,
S: Escaper,
{
type Item = Result<Token>;
/// Produce the next XIR [`Token`] from the input.
@ -230,11 +248,5 @@ impl<B: BufRead> Iterator for XmlXirReader<B> {
}
}
impl<B: BufRead> From<B> for XmlXirReader<B> {
fn from(reader: B) -> Self {
Self::new(reader)
}
}
#[cfg(test)]
mod test;

View File

@ -17,12 +17,14 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::borrow::Cow;
use super::*;
use crate::sym::GlobalSymbolIntern;
use crate::{
convert::ExpectInto,
span::DUMMY_SPAN,
xir::{AttrValue, Text, Token},
xir::{Error, Text, Token},
};
/// These tests use [`quick_xml`] directly,
@ -41,7 +43,24 @@ use crate::{
/// and by relying on certain parsing behavior to eliminate
/// redundant checks.
type Sut<B> = XmlXirReader<B>;
type Sut<'a, B, S> = XmlXirReader<'a, B, S>;
#[derive(Debug, Default)]
struct MockEscaper {}
// Simply adds ":UNESC" as a suffix to the provided byte slice.
impl Escaper for MockEscaper {
fn escape_bytes(_: &[u8]) -> Cow<[u8]> {
unreachable!("Reader should not be escaping!")
}
fn unescape_bytes(value: &[u8]) -> result::Result<Cow<[u8]>, Error> {
let mut unesc = value.to_owned();
unesc.extend_from_slice(b":UNESC");
Ok(Cow::Owned(unesc))
}
}
/// A byte that will be invalid provided that there is either no following
/// UTF-8 byte,
@ -55,9 +74,20 @@ const INVALID_UTF8_BYTE: u8 = 0b11000000u8;
const INVALID_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[INVALID_UTF8_BYTE]) };
macro_rules! new_sut {
($sut:ident = $data:expr) => {
new_sut!(b $sut = $data.as_bytes())
};
(b $sut:ident = $data:expr) => {
let escaper = MockEscaper::default();
let $sut = Sut::new($data, &escaper);
};
}
#[test]
fn empty_node_without_prefix_or_attributes() {
let sut = Sut::new("<empty-node />".as_bytes());
new_sut!(sut = "<empty-node />");
let result = sut.collect::<Result<Vec<_>>>();
@ -74,7 +104,7 @@ fn empty_node_without_prefix_or_attributes() {
// Resolving namespaces is not the concern of XIR.
#[test]
fn does_not_resolve_xmlns() {
let sut = Sut::new(r#"<no-ns xmlns="noresolve" />"#.as_bytes());
new_sut!(sut = r#"<no-ns xmlns="noresolve" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -84,7 +114,7 @@ fn does_not_resolve_xmlns() {
Token::Open("no-ns".unwrap_into(), DUMMY_SPAN),
// Since we didn't parse @xmlns, it's still an attribute.
Token::AttrName("xmlns".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("noresolve".intern()), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -94,7 +124,7 @@ fn does_not_resolve_xmlns() {
// Resolving namespaces is not the concern of XIR.
#[test]
fn empty_node_with_prefix_without_attributes_unresolved() {
let sut = Sut::new(r#"<x:empty-node xmlns:x="noresolve" />"#.as_bytes());
new_sut!(sut = r#"<x:empty-node xmlns:x="noresolve" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -104,7 +134,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
vec![
Token::Open(("x", "empty-node").unwrap_into(), DUMMY_SPAN),
Token::AttrName(("xmlns", "x").unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("noresolve".intern()), DUMMY_SPAN),
Token::AttrValue("noresolve:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -115,7 +145,7 @@ fn empty_node_with_prefix_without_attributes_unresolved() {
#[test]
fn prefix_with_empty_local_name_invalid_qname() {
// No local name (trailing colon).
let sut = Sut::new(r#"<x: xmlns:x="testns" />"#.as_bytes());
new_sut!(sut = r#"<x: xmlns:x="testns" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -130,7 +160,7 @@ fn prefix_with_empty_local_name_invalid_qname() {
// The order of attributes must be retained.
#[test]
fn multiple_attrs_ordered() {
let sut = Sut::new(r#"<ele foo="a" bar="b" b:baz="c" />"#.as_bytes());
new_sut!(sut = r#"<ele foo="a" bar="b" b:baz="c" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -139,11 +169,11 @@ fn multiple_attrs_ordered() {
vec![
Token::Open("ele".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("a".intern()), DUMMY_SPAN),
Token::AttrValue("a:UNESC".intern(), DUMMY_SPAN),
Token::AttrName("bar".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("b".intern()), DUMMY_SPAN),
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrName(("b", "baz").unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("c".intern()), DUMMY_SPAN),
Token::AttrValue("c:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -154,7 +184,7 @@ fn multiple_attrs_ordered() {
// need to allow it to support e.g. recovery, code formatting, and LSPs.
#[test]
fn permits_duplicate_attrs() {
let sut = Sut::new(r#"<dup attr="a" attr="b" />"#.as_bytes());
new_sut!(sut = r#"<dup attr="a" attr="b" />"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -163,9 +193,9 @@ fn permits_duplicate_attrs() {
vec![
Token::Open("dup".unwrap_into(), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("a".intern()), DUMMY_SPAN),
Token::AttrValue("a:UNESC".intern(), DUMMY_SPAN),
Token::AttrName("attr".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("b".intern()), DUMMY_SPAN),
Token::AttrValue("b:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
],
@ -174,7 +204,7 @@ fn permits_duplicate_attrs() {
#[test]
fn child_node_self_closing() {
let sut = Sut::new(r#"<root><child /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -193,7 +223,7 @@ fn child_node_self_closing() {
#[test]
fn sibling_nodes() {
let sut = Sut::new(r#"<root><child /><child /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child /><child /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -215,7 +245,7 @@ fn sibling_nodes() {
#[test]
fn child_node_with_attrs() {
let sut = Sut::new(r#"<root><child foo="bar" /></root>"#.as_bytes());
new_sut!(sut = r#"<root><child foo="bar" /></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -226,7 +256,7 @@ fn child_node_with_attrs() {
Token::AttrEnd,
Token::Open("child".unwrap_into(), DUMMY_SPAN),
Token::AttrName("foo".unwrap_into(), DUMMY_SPAN),
Token::AttrValue(AttrValue::from("bar".intern()), DUMMY_SPAN),
Token::AttrValue("bar:UNESC".intern(), DUMMY_SPAN),
Token::AttrEnd,
Token::Close(None, DUMMY_SPAN),
Token::Close(Some("root".unwrap_into()), DUMMY_SPAN),
@ -236,7 +266,7 @@ fn child_node_with_attrs() {
#[test]
fn child_text() {
let sut = Sut::new(r#"<text>foo bar</text>"#.as_bytes());
new_sut!(sut = r#"<text>foo bar</text>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -253,7 +283,7 @@ fn child_text() {
#[test]
fn mixed_child_content() {
let sut = Sut::new(r#"<text>foo<em>bar</em></text>"#.as_bytes());
new_sut!(sut = r#"<text>foo<em>bar</em></text>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -277,13 +307,12 @@ fn mixed_child_content() {
// opening and closing tags of the root node.
#[test]
fn mixed_child_content_with_newlines() {
let sut = Sut::new(
r#"
new_sut!(
sut = r#"
<root>
<child />
</root>
"#
.as_bytes(),
);
let result = sut.collect::<Result<Vec<_>>>();
@ -307,7 +336,7 @@ fn mixed_child_content_with_newlines() {
#[test]
fn child_cdata() {
let sut = Sut::new(r#"<cd><![CDATA[<foo />]]></cd>"#.as_bytes());
new_sut!(sut = r#"<cd><![CDATA[<foo />]]></cd>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -325,7 +354,7 @@ fn child_cdata() {
#[test]
fn mixed_child_text_and_cdata() {
let sut = Sut::new(r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#.as_bytes());
new_sut!(sut = r#"<cd>foo<bar/><![CDATA[<baz/>]]></cd>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -347,7 +376,7 @@ fn mixed_child_text_and_cdata() {
#[test]
fn comment() {
let sut = Sut::new(r#"<!--root--><root><!--<child>--></root>"#.as_bytes());
new_sut!(sut = r#"<!--root--><root><!--<child>--></root>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -365,12 +394,11 @@ fn comment() {
#[test]
fn comment_multiline() {
let sut = Sut::new(
r#"<mult><!--comment
new_sut!(
sut = r#"<mult><!--comment
on multiple
lines-->
</mult>"#
.as_bytes(),
);
let result = sut.collect::<Result<Vec<_>>>();
@ -393,7 +421,7 @@ lines-->
// XIRT handles mismatch errors; XIR must explicitly support them.
#[test]
fn permits_mismatched_tags() {
let sut = Sut::new(r#"<root><child /></mismatch>"#.as_bytes());
new_sut!(sut = r#"<root><child /></mismatch>"#);
let result = sut.collect::<Result<Vec<_>>>();
@ -414,7 +442,7 @@ fn permits_mismatched_tags() {
#[test]
fn node_name_invalid_utf8() {
let bytes: &[u8] = &[b'<', INVALID_UTF8_BYTE, b'/', b'>'];
let sut = Sut::new(bytes);
new_sut!(b sut = bytes);
let result = sut.collect::<Result<Vec<_>>>();
@ -434,7 +462,7 @@ fn attr_name_invalid_utf8() {
s.push_str(INVALID_STR);
s.push_str(r#"="value"/>"#);
let sut = Sut::new(s.as_bytes());
new_sut!(sut = s);
let result = sut.collect::<Result<Vec<_>>>();
@ -454,14 +482,28 @@ fn attr_value_invalid_utf8() {
s.push_str(INVALID_STR);
s.push_str(r#""/>"#);
let sut = Sut::new(s.as_bytes());
new_sut!(sut = s);
let result = sut.collect::<Result<Vec<_>>>();
match result {
Ok(_) => panic!("expected failure"),
Err(Error::InvalidUtf8(_, bytes)) => {
assert_eq!(bytes, &[b'b', b'a', b'd', INVALID_UTF8_BYTE]);
assert_eq!(
bytes,
&[
b'b',
b'a',
b'd',
INVALID_UTF8_BYTE,
b':',
b'U',
b'N',
b'E',
b'S',
b'C'
]
);
}
_ => panic!("unexpected failure"),
}

View File

@ -1,321 +0,0 @@
// XIR string and escape context
//
// Copyright (C) 2014-2021 Ryan Specialty Group, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! [`XirString`] and escape context.
//!
//! Safety
//! ======
//! The purpose of this type is to provide safety against XML injection by
//! encapsulating all responsibility within a single object.
//! The idea is simple:
//! if you have a safely constructed [`XirString`],
//! it can be safely used in any context without worrying about these
//! critical problems:
//!
//! 1. XML injection (via lack of escaping); and
//! 2. Erroneous multiple escape/unescape.
//!
//! Both of these problems are solved by ensuring that the proper context
//! for a given [`SymbolId`] is always maintained---a
//! symbol is either valid to be written or not.
//! Similarly,
//! we must know whether a symbol is escaped or not to know whether it
//! ought to be unescaped while reading.
//!
//! This context also ensures that we will not erroneously unescape or
//! re-escape values at multiple points in a program,
//! leading to incorrect data at best and vulnerabilities at worst.
//!
//! To ensure this safety,
//! it is important that types understand how to convert between
//! one-another in well-defined ways.
//! It should not be possible "just assume" that a value has already been
//! escaped.
//! Given that,
//! the constructors for this type are private to this module;
//! readers know the escape status and can produce the proper type and
//! internal types know how to translate between one-another,
//! but anything else making those assumptions is considered unsafe.
//!
//! Outside of this module,
//! it is assumed that the rest of the system is working with _unescaped_
//! values---afterall,
//! why would other parts of the system not dealing with XML directly
//! take it upon themselves to deal with XML directly?
//! Given that,
//! other modules can only read unescaped values and construct
//! [`XirString`] using unescaped values.
//! If we permitted retrieving raw escaped [`SymbolId`]s,
//! then we could construct from it another [`XirString`] that is
//! considered to be unescaped,
//! which would result in a double-escape if it were read using
//! [`XirString::escaped`].
use crate::sym::{
CIdentStaticSymbolId, GlobalSymbolInternBytes, GlobalSymbolInternUnchecked,
GlobalSymbolResolve, SymbolId, UriStaticSymbolId,
};
use std::{
borrow::Cow,
fmt::Display,
hash::{Hash, Hasher},
marker::PhantomData,
};
use super::Error;
/// An XML string that requires escaping before writing.
///
/// This type must be used in contexts where writing to an XML document is
/// not safe without proper escaping,
/// and where reading may require unescaping.
///
#[derive(Debug, Clone, Copy)]
pub struct XirString<S: XirStringEscaper = DefaultXirStringEscaper> {
unescaped: SymbolId,
escaped: Option<SymbolId>,
_escaper: PhantomData<S>,
}
// Since we implement Copy,
// ensure this size meets our expectations both as a sanity check and to
// ensure that attention is brought to this if it ever changes.
const_assert!(std::mem::size_of::<XirString>() <= std::mem::size_of::<usize>());
// A note about this type:
// Both fields are optional,
// but it is not valid to have both be `None`.
// To ensure that this is not possible,
// (a) the fields must remain private;
// (b) all constructors must initialize at least one of the fields; and
// (c) mutation must reconstruct using those constructors.
// This makes it possible to prove that the invariant always holds.
impl<S: XirStringEscaper> XirString<S> {
pub(super) fn from_escaped_raw(escaped: &[u8]) -> Result<Self, Error> {
let esc_sym = escaped.intern_utf8()?;
Ok(Self {
escaped: Some(esc_sym),
unescaped: match S::unescape_bytes(escaped)? {
// We got back what we sent in,
// so this value is fixed.
Cow::Borrowed(orig) => {
debug_assert!(orig == escaped);
esc_sym
}
// The value was rewritten,
// meaning that the original was escaped.
// We can't assume that it's valid UTF-8.
Cow::Owned(unesc) => unesc.intern_utf8()?,
},
_escaper: PhantomData,
})
}
pub const fn new_unescaped(sym: SymbolId) -> Self {
Self {
escaped: None,
unescaped: sym,
_escaper: PhantomData,
}
}
const fn new_fixed(sym: SymbolId) -> Self {
Self {
escaped: Some(sym),
unescaped: sym,
_escaper: PhantomData,
}
}
pub const unsafe fn assume_fixed(sym: SymbolId) -> Self {
Self::new_fixed(sym)
}
/// Construct a constant escaped attribute from a static C-style symbol.
pub const fn st_cid(sym: CIdentStaticSymbolId) -> Self {
Self::new_fixed(sym.as_sym())
}
/// Construct a constant escaped attribute from a static URI symbol.
///
/// URIs are expected _not_ to contain quotes.
pub const fn st_uri(sym: UriStaticSymbolId) -> Self {
Self::new_fixed(sym.as_sym())
}
// TODO: This unnecessarily allocates a symbol that'll just be written
// and not needed thereafter.
#[inline]
pub(super) fn into_escaped(self) -> SymbolId {
self.escaped.unwrap_or_else(|| S::escape(self.unescaped))
}