tame/tamer/src/xir/escape.rs

// XIR string escaping and unescaping
//
//  Copyright (C) 2014-2022 Ryan Specialty Group, LLC.
//
//  This file is part of TAME.
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.

//! Escaping and unescaping for writers and readers respectively.
//!
//! An [`Escaper`] is required by XIR readers and writers.
//! An escaper may perform caching to avoid unnecessary work,
//!   so it is advantageous to provide the _same_ instance to all readers
//!   and writers.
//! [`Escaper`] methods use interior mutability to facilitate this,
//!   since TAMER streams lowering operations where possible,
//!   meaning that multiple readers and writers will require references
//!     to the [`Escaper`].
//!
//! For more information on caching employed by TAMER to improve
//!   performance,
//!     see [`CachingEscaper`].
//!
//! Safety
//! ======
//! The purpose of this type is to provide safety against XML injection by
//!   encapsulating all responsibility within a single object.
//! The idea is simple:
//!   a [`SymbolId`] _always_ represents an unescaped string.
//! This prevents, primarily,
//!
//!  1. XML injection (via lack of escaping); and
//!  2. Erroneous multiple escape/unescape.
//!
//! This module is the _only_ part of the system that has access to raw,
//!   escaped values.
//! Outside of this module,
//!   it is assumed that the rest of the system is working with _unescaped_
//!   values---afterall,
//!     why would other parts of the system not dealing with XML directly
//!     take it upon themselves to deal with XML directly?
//! If we permitted retrieving raw escaped [`SymbolId`]s,
//!   then we run the risk of that value being used to construct a XIR
//!   stream and be subsequently double-encoded upon writing.

use fxhash::FxHashMap;

use crate::sym::{
    st::ST_COUNT, GlobalSymbolInternBytes, GlobalSymbolInternUnchecked,
    GlobalSymbolResolve, SymbolId,
};
use std::{borrow::Cow, cell::RefCell, collections::hash_map::Entry};

use super::error::SpanlessError;

/// XIR escaper and unescaper.
///
/// Escapers are responsible for parsing XML escape sequences as necessary
///   on read,
///     and properly escaping characters on write.
/// This is the only part of the system defending XIR against XML
///   injection.
///
/// Escapers must use interior mutability for any internal state
///   (e.g. caching),
///     since multiple readers and writers will require references.
pub trait Escaper: Default {
    /// Escape raw bytes such that they become suitable for writing into an
    ///   XML document as text.
    ///
    /// This value must be escaped such that subsequence unescaping
    ///   (using [`unescape_bytes`](Escaper::unescape_bytes))
    ///   will result in the same value.
    fn escape_bytes(value: &[u8]) -> Cow<[u8]>;

    /// Unescape raw bytes such that any relevant escape sequences are
    ///   parsed into their text representation.
    fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, SpanlessError>;

    /// Escape the given symbol and produce a [`SymbolId`] representing
    ///   the escaped value suitable for writing.
    #[inline]
    fn escape(&self, sym: SymbolId) -> SymbolId {
        match Self::escape_bytes(sym.lookup_str().as_bytes()) {
            // We got back what we sent in,
            //   so this value is fixed.
            Cow::Borrowed(_) => sym,

            // The value changed,
            //   so we must allocate a new symbol.
            // SAFETY: The unescaped symbol is valid UTF-8 unless it was
            //     unsafely allocated.
            //   Given that escaping does not introduce any invalid UTF-8
            //     sequences
            //       (as is trivially verified by reading its implementation),
            //       we can skip the UTF-8 check.
            Cow::Owned(esc) => unsafe { esc[..].intern_utf8_unchecked() },
        }
    }

    /// Unescape the provided raw value and return a [`SymbolId`]
    ///   representing the unescaped value.
    #[inline]
    fn unescape(&self, escaped: SymbolId) -> Result<SymbolId, SpanlessError> {
        Ok(
            match Self::unescape_bytes(escaped.lookup_str().as_bytes())? {
                // We got back what we sent in,
                //   so this value is fixed.
                Cow::Borrowed(_) => escaped,

                // The value was rewritten,
                //   meaning that the original was escaped.
                // We can't assume that it's valid UTF-8.
                Cow::Owned(unesc) => unesc.intern_utf8()?,
            },
        )
    }
}

/// Escape and unescape using [`quick_xml`].
#[derive(Debug, Clone, Copy, Default)]
pub struct QuickXmlEscaper {}

impl Escaper for QuickXmlEscaper {
    #[inline]
    fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
        quick_xml::escape::escape(value)
    }

    #[inline]
    fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, SpanlessError> {
        // For some reason,
        //   quick-xml has made EscapeError explicitly private to the crate,
        //     and so it is opaque to us.
        // They have, however,
        //   implemented `From<EscapeError> for Error`,
        //   which we will use here.
        Ok(quick_xml::escape::unescape(value)
            .map_err(quick_xml::Error::from)?)
    }
}

/// Cache escaped and unescaped [`SymbolId`]s.
///
/// _This cache should be shared between all readers and writers._
///
/// This takes advantage of the efficiency of the string internment system
///   to avoid the costs of escaping/unescaping if we've already encountered
///   the requested symbol previously.
///
/// There are a number of ways this is beneficial:
///
/// When a string is read,
///   its escaped [`SymbolId`] and associated unescaped [`SymbolId`] are
///   stored in a two-way mapping.
/// If another reader encounters the same [`SymbolId`],
///   it does not need to spend the time attempting to unescape it,
///   and will simply re-use the existing cached [`SymbolId`].
///
/// When a writer encounters a [`SymbolId`]
///   (representing the _unescaped_ value),
///   it is able to retrieve from cache the escaped [`SymbolId`] that was
///     originally encountered by a reader,
///       thereby saving it the time of re-escaping.
///
/// Escaped Representation
/// ======================
/// Note that this means that the escaped value will be the same as the
///   _first_ time that unescaped value was read
///   (there are many different ways to escape the same value);
///     an [`Escaper`] _does not_ guarantee a canonical escaped
///     representation.
///
/// While this appears to add a source of nondeterminism that undermines
///   reproducible builds,
///     it is mitigated by applying ordering to how files are loaded,
///     which is necessary to mitigate much more serious sources of
///       filesystem-based nondeterminism.
///
/// If this is burdensome in the future
///   (e.g. when writing a code formatter that needs to retain escapes),
///   there are other potential mitigations,
///     including modifying [`Escaper`] to accept spans as context or
///       augmenting XIR with an unescape hint.
#[derive(Debug, Default)]
pub struct CachingEscaper<S: Escaper> {
    /// Inner [`Escaper`] to be invoked to populate the cache.
    inner: S,
    /// Map from unescaped [`SymbolId`]s to their escaped represeation.
    toesc: RefCell<FxHashMap<SymbolId, SymbolId>>,
    /// Map from escaped [`SymbolId`]s to their unescaped value.
    tounesc: RefCell<FxHashMap<SymbolId, SymbolId>>,
}

impl<S: Escaper> CachingEscaper<S> {
    pub fn new(inner: S) -> Self {
        // We know we'll encounter more than the statically allocated
        //   symbols,
        //     given that we'll be reading parsing XML documents.
        // This can be adjusted as needed after profiling.
        let capacity = ST_COUNT * 2;

        Self {
            inner,
            toesc: RefCell::new(FxHashMap::with_capacity_and_hasher(
                capacity,
                Default::default(),
            )),
            tounesc: RefCell::new(FxHashMap::with_capacity_and_hasher(
                capacity,
                Default::default(),
            )),
        }
    }

    pub fn into_inner(self) -> S {
        self.inner
    }
}

impl<S: Escaper> Escaper for CachingEscaper<S> {
    #[inline]
    fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
        S::escape_bytes(value)
    }

    #[inline]
    fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, SpanlessError> {
        S::unescape_bytes(value)
    }

    #[inline]
    fn escape(&self, unescaped: SymbolId) -> SymbolId {
        *self.toesc.borrow_mut().entry(unescaped).or_insert_with(|| {
            let escaped = self.inner.escape(unescaped);

            // Later requests to unescape this newly escaped symbol will
            //   yield the unescaped value provided here.
            self.tounesc
                .borrow_mut()
                .entry(escaped)
                .or_insert(unescaped);

            escaped
        })
    }

    #[inline]
    fn unescape(&self, escaped: SymbolId) -> Result<SymbolId, SpanlessError> {
        Ok(match self.tounesc.borrow_mut().entry(escaped) {
            Entry::Occupied(unescaped) => *unescaped.get(),
            Entry::Vacant(entry) => {
                let unescaped = *entry.insert(self.inner.unescape(escaped)?);

                // There are many escaped representations for the same
                //   unescaped value.
                // We will keep the first one that we encountered.
                self.toesc.borrow_mut().entry(unescaped).or_insert(escaped);

                unescaped
            }
        })
    }
}

pub type DefaultEscaper = CachingEscaper<QuickXmlEscaper>;

#[cfg(test)]
mod test {
    use super::*;
    use crate::sym::GlobalSymbolIntern;

    mod cache {
        use super::*;
        use std::{collections::HashMap, result};

        // Maintain counts of calls rather than providing stubs,
        //   to avoid `RefCell<Rc<Refcell<Option<SymbolId>>>>` for
        //   concurrent access.
        #[derive(Debug, Default)]
        struct StubEscaper {
            escape_map: HashMap<SymbolId, SymbolId>,
            unescape_map: HashMap<SymbolId, SymbolId>,
            escape_count: RefCell<FxHashMap<SymbolId, usize>>,
            unescape_count: RefCell<FxHashMap<SymbolId, usize>>,
        }

        impl Escaper for StubEscaper {
            fn escape_bytes(_: &[u8]) -> Cow<[u8]> {
                unreachable!("escape_bytes should not be called")
            }

            fn unescape_bytes(
                _: &[u8],
            ) -> result::Result<Cow<[u8]>, SpanlessError> {
                unreachable!("unescape_bytes should not be called")
            }

            fn escape(&self, given: SymbolId) -> SymbolId {
                *self.escape_count.borrow_mut().entry(given).or_default() += 1;
                *self.escape_map.get(&given).expect("unexpected escape")
            }

            fn unescape(
                &self,
                given: SymbolId,
            ) -> Result<SymbolId, SpanlessError> {
                *self.unescape_count.borrow_mut().entry(given).or_default() +=
                    1;
                Ok(*self.unescape_map.get(&given).expect("unexpected unescape"))
            }
        }

        #[test]
        fn caching_escaper_unescape() {
            let esc = "escaped".intern();
            let unesc = "unescaped".intern();

            let sut = CachingEscaper::new(StubEscaper {
                escape_map: [(unesc, esc)].into(),
                unescape_map: [(esc, unesc)].into(),
                ..Default::default()
            });

            // Invoke unescape more than once to ensure caching occurs.
            assert_eq!(sut.unescape(esc).unwrap(), unesc);
            assert_eq!(sut.unescape(esc).unwrap(), unesc);

            // And escape once, using a previous unescaped result.
            assert_eq!(sut.escape(unesc), esc);

            // We should have invoked the underlying escaper only once for
            //   the unescape operation.
            let stub = sut.into_inner();
            assert_eq!(stub.unescape_count.borrow().get(&esc), Some(&1));

            // And, having previously encountered the escaped value from
            //   unescaping,
            //     we should _not_ have invoked the escaper _at all_ when we
            //     escaped the value.
            // This means that previously encountered escapes will always
            //   take precedence over any explicit escape result.
            assert_eq!(stub.escape_count.borrow().get(&unesc), None);
        }

        #[test]
        fn caching_escaper_escape() {
            let esc = "escaped".intern();
            let unesc = "unescaped".intern();

            let sut = CachingEscaper::new(StubEscaper {
                escape_map: [(unesc, esc)].into(),
                unescape_map: [(esc, unesc)].into(),
                ..Default::default()
            });

            // Invoke escape more than once to ensure caching occurs.
            assert_eq!(sut.escape(unesc), esc);
            assert_eq!(sut.escape(unesc), esc);

            // And unescape once, using a previous escaped result.
            assert_eq!(sut.unescape(esc).unwrap(), unesc);

            // We should have invoked the underlying escaper only once for
            //   the escape operation.
            let stub = sut.into_inner();
            assert_eq!(stub.escape_count.borrow().get(&unesc), Some(&1));

            // And, having previously encountered the unescaped value from
            //   escaping,
            //     we should _not_ have invoked the escaper _at all_ when we
            //     unescaped the value.
            // This means that previously encountered unescapes will always
            //   take precedence over any explicit unescape result.
            assert_eq!(stub.unescape_count.borrow().get(&esc), None);
        }
    }
}