// XIR string escaping and unescaping // // Copyright (C) 2014-2021 Ryan Specialty Group, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Escaping and unescaping for writers and readers respectively. //! //! An [`Escaper`] is required by XIR readers and writers. //! An escaper may perform caching to avoid unnecessary work, //! so it is advantageous to provide the _same_ instance to all readers //! and writers. //! [`Escaper`] methods use interior mutability to facilitate this, //! since TAMER streams lowering operations where possible, //! meaning that multiple readers and writers will require references //! to the [`Escaper`]. //! //! Safety //! ====== //! The purpose of this type is to provide safety against XML injection by //! encapsulating all responsibility within a single object. //! The idea is simple: //! a [`SymbolId`] _always_ represents an unescaped string. //! This prevents, primarily, //! //! 1. XML injection (via lack of escaping); and //! 2. Erroneous multiple escape/unescape. //! //! This module is the _only_ part of the system that has access to raw, //! escaped values. //! Outside of this module, //! it is assumed that the rest of the system is working with _unescaped_ //! values---afterall, //! why would other parts of the system not dealing with XML directly //! take it upon themselves to deal with XML directly? //! If we permitted retrieving raw escaped [`SymbolId`]s, //! then we run the risk of that value being used to construct a XIR //! stream and be subsequently double-encoded upon writing. use crate::sym::{ GlobalSymbolInternBytes, GlobalSymbolInternUnchecked, GlobalSymbolResolve, SymbolId, }; use std::borrow::Cow; use super::Error; /// XIR escaper and unescaper. /// /// Escapers are responsible for parsing XML escape sequences as necessary /// on read, /// and properly escaping characters on write. /// This is the only part of the system defending XIR against XML /// injection. /// /// Escapers must use interior mutability for any internal state /// (e.g. caching), /// since multiple readers and writers will require references. pub trait Escaper: Default { /// Escape raw bytes such that they become suitable for writing into an /// XML document as text. /// /// This value must be escaped such that subsequence unescaping /// (using [`unescape_bytes`](Escaper::unescape_bytes)) /// will result in the same value. fn escape_bytes(value: &[u8]) -> Cow<[u8]>; /// Unescape raw bytes such that any relevant escape sequences are /// parsed into their text representation. fn unescape_bytes(value: &[u8]) -> Result, Error>; /// Escape the given symbol and produce a [`SymbolId`] representing /// the escaped value suitable for writing. fn escape(&self, sym: SymbolId) -> SymbolId { match Self::escape_bytes(sym.lookup_str().as_bytes()) { // We got back what we sent in, // so this value is fixed. Cow::Borrowed(_) => sym, // The value changed, // so we must allocate a new symbol. // SAFETY: The unescaped symbol is valid UTF-8 unless it was // unsafely allocated. // Given that escaping does not introduce any invalid UTF-8 // sequences // (as is trivially verified by reading its implementation), // we can skip the UTF-8 check. Cow::Owned(esc) => unsafe { esc[..].intern_utf8_unchecked() }, } } /// Unescape the provided raw value and return a [`SymbolId`] /// representing the unescaped value. fn unescape_intern<'a>( &self, escaped: &'a [u8], ) -> Result { Ok(match Self::unescape_bytes(escaped)? { // We got back what we sent in, // so this value is fixed. Cow::Borrowed(orig) => { debug_assert!(orig == escaped); orig.intern_utf8()? } // The value was rewritten, // meaning that the original was escaped. // We can't assume that it's valid UTF-8. Cow::Owned(unesc) => unesc.intern_utf8()?, }) } } /// Escape and unescape using [`quick_xml`]. #[derive(Debug, Clone, Copy, Default)] pub struct QuickXmlEscaper {} impl Escaper for QuickXmlEscaper { #[inline] fn escape_bytes(value: &[u8]) -> Cow<[u8]> { quick_xml::escape::escape(value) } #[inline] fn unescape_bytes(value: &[u8]) -> Result, Error> { // For some reason, // quick-xml has made EscapeError explicitly private to the crate, // and so it is opaque to us. // They have, however, // implemented `From for Error`, // which we will use here. Ok(quick_xml::escape::unescape(value) .map_err(quick_xml::Error::from)?) } } /// Perform no escaping or unescaping. /// /// _This should be removed after development of the XIR-based readers!_ #[cfg(not(feature = "wip-xmlo-xir-reader"))] #[derive(Debug, Clone, Copy, Default)] pub struct NullEscaper {} #[cfg(not(feature = "wip-xmlo-xir-reader"))] impl Escaper for NullEscaper { #[inline] fn escape_bytes(value: &[u8]) -> Cow<[u8]> { Cow::Borrowed(value) } #[inline] fn unescape_bytes(_value: &[u8]) -> Result, Error> { panic!("NullEscaper should not be used for unescaping") } } pub type DefaultEscaper = QuickXmlEscaper; #[cfg(test)] mod test { use super::*; use crate::sym::GlobalSymbolIntern; // Simple sanity check to ensure that the default escaper actually does // some sort of escaping. #[test] fn default_escaper_escapes() { let sut = DefaultEscaper::default(); assert_eq!( "foo<bar".intern(), sut.escape("foo