tamer: xir::escape::CachingEscaper: New Escaper

As promised, this will cache previously seen escaped/unescaped values by creating a two-way mapping between them. DEV-11081
2021-11-12 16:07:57 -05:00 · 2021-11-12 16:07:57 -05:00 · d710437ee4
parent 27ba03b59b
commit d710437ee4
5 changed files with 245 additions and 47 deletions
--- a/tamer/src/ld/poc.rs
+++ b/tamer/src/ld/poc.rs
@ -60,18 +60,7 @@ type LinkerAsgBuilderState =
 pub fn xmle(package_path: &str, output: &str) -> Result<(), Box<dyn Error>> {
    let mut fs = VisitOnceFilesystem::new();
    let mut depgraph = LinkerAsg::with_capacity(65536, 65536);
-    let escaper = {
-        #[cfg(feature = "wip-xmlo-xir-reader")]
-        {
-            DefaultEscaper::default()
-        }
-        #[cfg(not(feature = "wip-xmlo-xir-reader"))]
-        {
-            // The original POC linker did nothing with escape sequences,
-            //   since it simply shuffles data around and re-outputs as XML.
-            crate::xir::NullEscaper::default()
-        }
-    };
+    let escaper = DefaultEscaper::default();

    let state = load_xmlo(
        package_path,
--- a/tamer/src/xir.rs
+++ b/tamer/src/xir.rs
@ -67,7 +67,7 @@ mod error;
 pub use error::Error;

 mod escape;
-pub use escape::{DefaultEscaper, Escaper, NullEscaper};
+pub use escape::{DefaultEscaper, Escaper};

 pub mod iter;
 pub mod pred;
--- a/tamer/src/xir/escape.rs
+++ b/tamer/src/xir/escape.rs
@ -28,6 +28,10 @@
 //!   meaning that multiple readers and writers will require references
 //!     to the [`Escaper`].
 //!
+//! For more information on caching employed by TAMER to improve
+//!   performance,
+//!     see [`CachingEscaper`].
+//!
 //! Safety
 //! ======
 //! The purpose of this type is to provide safety against XML injection by
@ -50,11 +54,13 @@
 //!   then we run the risk of that value being used to construct a XIR
 //!   stream and be subsequently double-encoded upon writing.

+use fxhash::FxHashMap;
+
 use crate::sym::{
    GlobalSymbolInternBytes, GlobalSymbolInternUnchecked, GlobalSymbolResolve,
    SymbolId,
 };
-use std::borrow::Cow;
+use std::{borrow::Cow, cell::RefCell, collections::hash_map::Entry};

 use super::Error;

@ -84,6 +90,7 @@ pub trait Escaper: Default {

    /// Escape the given symbol and produce a [`SymbolId`] representing
    ///   the escaped value suitable for writing.
+    #[inline]
    fn escape(&self, sym: SymbolId) -> SymbolId {
        match Self::escape_bytes(sym.lookup_str().as_bytes()) {
            // We got back what we sent in,
@ -104,23 +111,20 @@ pub trait Escaper: Default {

    /// Unescape the provided raw value and return a [`SymbolId`]
    ///   representing the unescaped value.
-    fn unescape_intern<'a>(
-        &self,
-        escaped: &'a [u8],
-    ) -> Result<SymbolId, Error> {
-        Ok(match Self::unescape_bytes(escaped)? {
-            // We got back what we sent in,
-            //   so this value is fixed.
-            Cow::Borrowed(orig) => {
-                debug_assert!(orig == escaped);
-                orig.intern_utf8()?
-            }
+    #[inline]
+    fn unescape(&self, escaped: SymbolId) -> Result<SymbolId, Error> {
+        Ok(
+            match Self::unescape_bytes(escaped.lookup_str().as_bytes())? {
+                // We got back what we sent in,
+                //   so this value is fixed.
+                Cow::Borrowed(_) => escaped,

-            // The value was rewritten,
-            //   meaning that the original was escaped.
-            // We can't assume that it's valid UTF-8.
-            Cow::Owned(unesc) => unesc.intern_utf8()?,
-        })
+                // The value was rewritten,
+                //   meaning that the original was escaped.
+                // We can't assume that it's valid UTF-8.
+                Cow::Owned(unesc) => unesc.intern_utf8()?,
+            },
+        )
    }
 }

@ -147,6 +151,118 @@ impl Escaper for QuickXmlEscaper {
    }
 }

+/// Cache escaped and unescaped [`SymbolId`]s.
+///
+/// _This cache should be shared between all readers and writers._
+///
+/// This takes advantage of the efficiency of the string internment system
+///   to avoid the costs of escaping/unescaping if we've already encountered
+///   the requested symbol previously.
+///
+/// There are a number of ways this is beneficial:
+///
+/// When a string is read,
+///   its escaped [`SymbolId`] and associated unescaped [`SymbolId`] are
+///   stored in a two-way mapping.
+/// If another reader encounters the same [`SymbolId`],
+///   it does not need to spend the time attempting to unescape it,
+///   and will simply re-use the existing cached [`SymbolId`].
+///
+/// When a writer encounters a [`SymbolId`]
+///   (representing the _unescaped_ value),
+///   it is able to retrieve from cache the escaped [`SymbolId`] that was
+///     originally encountered by a reader,
+///       thereby saving it the time of re-escaping.
+///
+/// Escaped Representation
+/// ======================
+/// Note that this means that the escaped value will be the same as the
+///   _first_ time that unescaped value was read
+///   (there are many different ways to escape the same value);
+///     an [`Escaper`] _does not_ guarantee a canonical escaped
+///     representation.
+///
+/// While this appears to add a source of nondeterminism that undermines
+///   reproducible builds,
+///     it is mitigated by applying ordering to how files are loaded,
+///     which is necessary to mitigate much more serious sources of
+///       filesystem-based nondeterminism.
+///
+/// If this is burdensome in the future
+///   (e.g. when writing a code formatter that needs to retain escapes),
+///   there are other potential mitigations,
+///     including modifying [`Escaper`] to accept spans as context or
+///       augmenting XIR with an unescape hint.
+#[derive(Debug, Default)]
+pub struct CachingEscaper<S: Escaper> {
+    /// Inner [`Escaper`] to be invoked to populate the cache.
+    inner: S,
+    /// Map from unescaped [`SymbolId`]s to their escaped represeation.
+    toesc: RefCell<FxHashMap<SymbolId, SymbolId>>,
+    /// Map from escaped [`SymbolId`]s to their unescaped value.
+    tounesc: RefCell<FxHashMap<SymbolId, SymbolId>>,
+}
+
+impl<S: Escaper> CachingEscaper<S> {
+    pub fn new(inner: S) -> Self {
+        // TODO: Capacity that is at least double the length of the static
+        // symbols.
+        Self {
+            inner,
+            ..Default::default()
+        }
+    }
+
+    pub fn into_inner(self) -> S {
+        self.inner
+    }
+}
+
+impl<S: Escaper> Escaper for CachingEscaper<S> {
+    #[inline]
+    fn escape_bytes(value: &[u8]) -> Cow<[u8]> {
+        S::escape_bytes(value)
+    }
+
+    #[inline]
+    fn unescape_bytes(value: &[u8]) -> Result<Cow<[u8]>, Error> {
+        S::unescape_bytes(value)
+    }
+
+    #[inline]
+    fn escape(&self, unescaped: SymbolId) -> SymbolId {
+        *self.toesc.borrow_mut().entry(unescaped).or_insert_with(|| {
+            let escaped = self.inner.escape(unescaped);
+
+            // Later requests to unescape this newly escaped symbol will
+            //   yield the unescaped value provided here.
+            self.tounesc
+                .borrow_mut()
+                .entry(escaped)
+                .or_insert(unescaped);
+
+            escaped
+        })
+    }
+
+    #[inline]
+    fn unescape(&self, escaped: SymbolId) -> Result<SymbolId, Error> {
+        Ok(match self.tounesc.borrow_mut().entry(escaped) {
+            Entry::Occupied(unescaped) => *unescaped.get(),
+            Entry::Vacant(entry) => {
+                let unescaped = *entry.insert(self.inner.unescape(escaped)?);
+
+                // There are many escaped representations for the same
+                //   unescaped value.
+                // We will keep the first one that we encountered.
+                self.toesc.borrow_mut().entry(unescaped).or_insert(escaped);
+
+                unescaped
+            }
+        })
+    }
+}
+
 /// Perform no escaping or unescaping.
 ///
 /// _This should be removed after development of the XIR-based readers!_
@ -167,7 +283,10 @@ impl Escaper for NullEscaper {
    }
 }

-pub type DefaultEscaper = QuickXmlEscaper;
+#[cfg(feature = "wip-xmlo-xir-reader")]
+pub type DefaultEscaper = CachingEscaper<QuickXmlEscaper>;
+#[cfg(not(feature = "wip-xmlo-xir-reader"))]
+pub type DefaultEscaper = NullEscaper;

 #[cfg(test)]
 mod test {
@ -176,6 +295,7 @@ mod test {

    // Simple sanity check to ensure that the default escaper actually does
    //   some sort of escaping.
+    #[cfg(feature = "wip-xmlo-xir-reader")]
    #[test]
    fn default_escaper_escapes() {
        let sut = DefaultEscaper::default();
@ -185,4 +305,105 @@ mod test {
            sut.escape("foo<bar".intern()).into(),
        );
    }
+
+    mod cache {
+        use super::*;
+        use std::{collections::HashMap, result};
+
+        // Maintain counts of calls rather than providing stubs,
+        //   to avoid `RefCell<Rc<Refcell<Option<SymbolId>>>>` for
+        //   concurrent access.
+        #[derive(Debug, Default)]
+        struct StubEscaper {
+            escape_map: HashMap<SymbolId, SymbolId>,
+            unescape_map: HashMap<SymbolId, SymbolId>,
+            escape_count: RefCell<FxHashMap<SymbolId, usize>>,
+            unescape_count: RefCell<FxHashMap<SymbolId, usize>>,
+        }
+
+        impl Escaper for StubEscaper {
+            fn escape_bytes(_: &[u8]) -> Cow<[u8]> {
+                unreachable!("escape_bytes should not be called")
+            }
+
+            fn unescape_bytes(_: &[u8]) -> result::Result<Cow<[u8]>, Error> {
+                unreachable!("unescape_bytes should not be called")
+            }
+
+            fn escape(&self, given: SymbolId) -> SymbolId {
+                *self.escape_count.borrow_mut().entry(given).or_default() += 1;
+                *self.escape_map.get(&given).expect("unexpected escape")
+            }
+
+            fn unescape(&self, given: SymbolId) -> Result<SymbolId, Error> {
+                *self.unescape_count.borrow_mut().entry(given).or_default() +=
+                    1;
+                Ok(*self.unescape_map.get(&given).expect("unexpected unescape"))
+            }
+        }
+
+        #[test]
+        fn caching_escaper_unescape() {
+            let esc = "escaped".intern();
+            let unesc = "unescaped".intern();
+
+            let sut = CachingEscaper::new(StubEscaper {
+                escape_map: [(unesc, esc)].into(),
+                unescape_map: [(esc, unesc)].into(),
+                ..Default::default()
+            });
+
+            // Invoke unescape more than once to ensure caching occurs.
+            assert_eq!(sut.unescape(esc).unwrap(), unesc);
+            assert_eq!(sut.unescape(esc).unwrap(), unesc);
+
+            // And escape once, using a previous unescaped result.
+            assert_eq!(sut.escape(unesc), esc);
+
+            // We should have invoked the underlying escaper only once for
+            //   the unescape operation.
+            let stub = sut.into_inner();
+            assert_eq!(stub.unescape_count.borrow().get(&esc), Some(&1));
+
+            // And, having previously encountered the escaped value from
+            //   unescaping,
+            //     we should _not_ have invoked the escaper _at all_ when we
+            //     escaped the value.
+            // This means that previously encountered escapes will always
+            //   take precedence over any explicit escape result.
+            assert_eq!(stub.escape_count.borrow().get(&unesc), None);
+        }
+
+        #[test]
+        fn caching_escaper_escape() {
+            let esc = "escaped".intern();
+            let unesc = "unescaped".intern();
+
+            let sut = CachingEscaper::new(StubEscaper {
+                escape_map: [(unesc, esc)].into(),
+                unescape_map: [(esc, unesc)].into(),
+                ..Default::default()
+            });
+
+            // Invoke escape more than once to ensure caching occurs.
+            assert_eq!(sut.escape(unesc), esc);
+            assert_eq!(sut.escape(unesc), esc);
+
+            // And unescape once, using a previous escaped result.
+            assert_eq!(sut.unescape(esc).unwrap(), unesc);
+
+            // We should have invoked the underlying escaper only once for
+            //   the escape operation.
+            let stub = sut.into_inner();
+            assert_eq!(stub.escape_count.borrow().get(&unesc), Some(&1));
+
+            // And, having previously encountered the unescaped value from
+            //   escaping,
+            //     we should _not_ have invoked the escaper _at all_ when we
+            //     unescaped the value.
+            // This means that previously encountered unescapes will always
+            //   take precedence over any explicit unescape result.
+            assert_eq!(stub.unescape_count.borrow().get(&esc), None);
+        }
+    }
 }
--- a/tamer/src/xir/reader.rs
+++ b/tamer/src/xir/reader.rs
@ -214,7 +214,8 @@ impl<'s, B: BufRead, S: Escaper> XmlXirReader<'s, B, S> {
            //     that's okay as long as we can read it again,
            //       but we probably should still throw an error if we
            //       encounter such a situation.
-            let value = escaper.unescape_intern(attr.value.as_ref())?.into();
+            let value =
+                escaper.unescape(attr.value.as_ref().intern_utf8()?)?.into();

            tokbuf.push_front(Token::AttrName(name, DUMMY_SPAN));
            tokbuf.push_front(Token::AttrValue(value, DUMMY_SPAN));
--- a/tamer/src/xir/reader/test.rs
+++ b/tamer/src/xir/reader/test.rs
@ -489,21 +489,8 @@ fn attr_value_invalid_utf8() {
    match result {
        Ok(_) => panic!("expected failure"),
        Err(Error::InvalidUtf8(_, bytes)) => {
-            assert_eq!(
-                bytes,
-                &[
-                    b'b',
-                    b'a',
-                    b'd',
-                    INVALID_UTF8_BYTE,
-                    b':',
-                    b'U',
-                    b'N',
-                    b'E',
-                    b'S',
-                    b'C'
-                ]
-            );
+            // Doesn't make it to the Escaper.
+            assert_eq!(bytes, &[b'b', b'a', b'd', INVALID_UTF8_BYTE]);
        }
        _ => panic!("unexpected failure"),
    }