// String interner // // Copyright (C) 2014-2023 Ryan Specialty, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Interners used to intern values as symbols. //! //! See the [parent module](super) for more information. //! //! //! Using Interners Directly (Without Global State) //! =============================================== //! Please do not do this unless you have a compelling use case and know //! what you are doing, //! including understanding how to mitigate mixing of [`SymbolId`]s, //! such as with newtypes or encapsulation. //! Otherwise, //! use the global interners instead, //! as documented in the [parent module](super). //! //! ``` //! use tamer::sym::{DefaultProgInterner, Interner, SymbolId}; //! //! // Inputs to be interned //! let a = "foo"; //! let b = &"foo".to_string(); //! let c = "foobar"; //! let d = &c[0..3]; //! //! // Interners employ interior mutability and so do not need to be //! // declared `mut` //! let interner = DefaultProgInterner::new(); //! //! let (ia, ib, ic, id) = ( //! interner.intern(a), //! interner.intern(b), //! interner.intern(c), //! interner.intern(d), //! ); //! //! assert_eq!(ia, ib); //! assert_eq!(ia, id); //! assert_eq!(ib, id); //! assert_ne!(ia, ic); //! //! // Only "foo" and "foobar" are interned //! assert_eq!(2, interner.len()); //! assert!(interner.contains("foo")); //! 
assert!(interner.contains("foobar")); //! assert!(!interner.contains("something else")); //! //! // Symbols can also be looked up by index. //! assert_eq!("foo", interner.index_lookup(ia).unwrap()); //! ``` use super::{SymbolId, SymbolIndexSize}; use crate::global; use bumpalo::Bump; use fxhash::FxBuildHasher; use std::cell::RefCell; use std::collections::HashMap; use std::convert::{TryFrom, TryInto}; use std::fmt::Debug; use std::hash::BuildHasher; use std::str::{from_utf8, from_utf8_unchecked, Utf8Error}; /// Create, store, compare, and retrieve interned values. /// /// Interners accept string slices and produce values of type [`SymbolId`]. /// The same [`SymbolId`] will always be returned for a given string, /// allowing symbols to be compared for equality cheaply by comparing /// integers. /// Symbol locations in memory are fixed for the lifetime of the interner, /// and can be retrieved as [`str`] using /// [`index_lookup`](Interner::index_lookup). /// /// If you care whether a value has been interned yet or not, /// see [`intern_soft`][Interner::intern_soft`] and /// [`contains`](Interner::contains). /// /// See the [module-level documentation](self) for an example. /// For interfaces to the global interners that indirectly use these /// methods, /// see the [parent module](super). pub trait Interner<'i, Ix: SymbolIndexSize> { /// Intern a string slice or return an existing [`SymbolId`]. /// /// If the provided string has already been interned, /// then an existing [`SymbolId`] will be returned. /// Otherwise, /// the string will be interned and a new [`SymbolId`] allocated. /// /// To retrieve an existing symbol _without_ interning, /// see [`intern_soft`](Interner::intern_soft). fn intern(&self, value: &str) -> SymbolId; /// Retrieve an existing intern for the provided string slice. /// /// Unlike [`intern`](Interner::intern), /// this will _not_ intern the string if it has not already been /// interned. 
fn intern_soft(&self, value: &str) -> Option>; /// Copy the provided slice into the intern pool and produce a symbol, /// but do not intern the symbol. /// /// The symbol will never compare equal to any other symbol, /// regardless of the underlying string. /// Consequently, /// this evades the cost of hashing the string, /// allowing for a [`SymbolId`] to be used in place of [`String`]. /// /// See "Uninterned Symbols" in the documentation of the /// [`sym` module](super) for more information. fn clone_uninterned(&self, value: &str) -> SymbolId; /// Determine whether the given value has already been interned. /// /// This is equivalent to `intern_soft(value).is_some()`. fn contains(&self, value: &str) -> bool; /// Number of interned strings in this interner's pool. /// /// This count will increase each time a unique string is interned. /// It does not increase when a string is already interned. fn len(&self) -> usize; /// Look up a symbol's string value by its [`SymbolId`]. /// /// This will always return a [`str`] as long as the provided /// `index` represents a symbol interned with this interner. /// If the index is not found, /// the result is [`None`]. /// /// [`str`] requires significantly more storage than an appropriate /// [`SymbolId`] and should only be used when a string value must be /// written (e.g. to a file or displayed to the user). fn index_lookup(&'i self, index: SymbolId) -> Option<&'i str>; /// Intern a byte slice as a UTF-8 string. /// /// This method is intended as a performance optimization to avoid /// unnecessary UTF-8 checks when a byte slice has already been /// interned. /// /// This first checks to see if the provided byte slice matches an /// existing intern, /// returning the symbol if found. 
/// This allows us to skip the cost of a UTF-8 check for strings that /// have already been encountered, /// since their presence in the pool means that the string was /// either /// (a) already interned as a valid UTF-8 string; or /// (b) was interned using an unsafe function. /// In the case of (b), /// the safety violation is the fault of the original caller, /// and there's nothing we can do about it now. /// /// Note that this optimization is only beneficial when a string has /// already been interned. /// To avoid the cost of UTF-8 checks entirely, /// see [`Interner::intern_utf8_unchecked`]. /// /// If the byte slice does not represent a valid UTF-8 string, /// a [`Utf8Error`] will be returned along with a reference to the /// provided byte string. /// The purpose of this pair is to simplify error conversions /// usingĀ `?` so that errors can contain additional context. fn intern_utf8<'a>( &self, value: &'a [u8], ) -> Result, (Utf8Error, &'a [u8])>; /// Intern an assumed-UTF-8 slice of bytes or return an existing /// [`SymbolId`]. /// /// Safety /// ====== /// This function is unsafe because it uses [`from_utf8_unchecked`]. /// It is provided for convenience when interning from trusted binary /// data /// (such as [object files][]). /// /// [object files]: crate::obj unsafe fn intern_utf8_unchecked(&self, value: &[u8]) -> SymbolId { self.intern(from_utf8_unchecked(value)) } /// Copy the provided assumed-UTF-8 slice of bytes into the intern pool /// and produce a symbol, /// but do not intern the symbol. /// /// See [`clone_uninterned`](Interner::clone_uninterned) for more /// information. /// /// Safety /// ====== /// This function is unsafe because it uses [`from_utf8_unchecked`]. /// It is provided for convenience when interning from trusted binary /// data /// (such as [object files][]). 
/// /// [object files]: crate::obj unsafe fn clone_uninterned_utf8_unchecked( &self, value: &[u8], ) -> SymbolId { self.clone_uninterned(from_utf8_unchecked(value)) } } /// An interner backed by an [arena](bumpalo). /// /// Since all symbols exist until the interner itself is freed, /// an arena is a much more efficient and appropriate memory allocation /// strategy. /// This also provides a stable location in memory for symbol data. /// /// For the recommended configuration, /// see [`DefaultInterner`]. /// /// See the [module-level documentation](self) for examples and more /// information on how to use this interner. pub struct ArenaInterner<'i, S, Ix = global::ProgSymSize> where S: BuildHasher + Default, Ix: SymbolIndexSize, { /// Storage for interned strings. arena: Bump, /// Interned strings by [`SymbolId`]. /// /// The first index must always be populated during initialization to /// ensure that [`SymbolId`] will never beĀ `0`. /// /// These string slices are stored in `arena`. strings: RefCell>, /// Map of interned strings to their respective [`SymbolId`]. /// /// This allows us to determine whether a string has already been /// interned and, if so, to return its corresponding symbol. map: RefCell, S>>, } impl<'i, S, Ix> ArenaInterner<'i, S, Ix> where S: BuildHasher + Default, Ix: SymbolIndexSize, >::Error: Debug, { /// Initialize a new interner with no initial capacity. /// /// Prefer [`with_capacity`](ArenaInterner::with_capacity) when possible. #[inline] pub fn new() -> Self { Self::with_capacity(0) } /// Initialize a new interner with an initial capacity for the /// underlying [`HashMap`]. /// /// The given `capacity` has no affect on arena allocation. /// Specifying initial capacity is important only for the map of strings /// to symbols because it will reallocate and re-hash its contents /// once capacity is exceeded. /// See benchmarks. 
/// /// If reallocation is a major concern, /// a [consistent hashing algorithm][consistent] could be considered, /// but the implementation will still incur the cost of copying /// the [`HashMap`]'s contents to a new location in memory. /// /// [consistent]: https://en.wikipedia.org/wiki/Consistent_hashing #[inline] pub fn with_capacity(capacity: usize) -> Self { let mut strings = Vec::<_>::with_capacity(capacity); // The first index is not used since SymbolId cannot be 0. strings.push(""); Self { arena: Bump::new(), strings: RefCell::new(strings), map: RefCell::new(HashMap::with_capacity_and_hasher( capacity, Default::default(), )), } } #[inline] fn get_next_symbol_id(syms: &mut Vec<&'i str>) -> SymbolId { let next_index: Ix = syms .len() .try_into() .expect("internal error: SymbolId range exhausted"); // This is not actually unsafe because next_index is always >0 // from initialization. debug_assert!(Ix::new(next_index).is_some()); // != 0 check unsafe { SymbolId::from_int_unchecked(next_index) } } #[inline] fn copy_slice_into_arena(&self, value: &str) -> &'i str { unsafe { &*(from_utf8_unchecked( self.arena.alloc_slice_clone(value.as_bytes()), ) as *const str) } } /// Intern the provided value without looking for an existing intern. /// /// _This is an internal function that should only be used after having /// already checked for an existing intern._ /// It exists only to share common logic across methods. 
#[inline] fn intern_without_lookup(&self, value: &str) -> SymbolId { let mut syms = self.strings.borrow_mut(); let id = Self::get_next_symbol_id(&mut syms); let clone = self.copy_slice_into_arena(value); self.map.borrow_mut().insert(clone, id); syms.push(clone); id } } impl<'i, S, Ix> Interner<'i, Ix> for ArenaInterner<'i, S, Ix> where S: BuildHasher + Default, Ix: SymbolIndexSize, >::Error: Debug, { fn intern(&self, value: &str) -> SymbolId { if let Some(sym) = self.map.borrow().get(value) { return *sym; } self.intern_without_lookup(value) } #[inline] fn intern_soft(&self, value: &str) -> Option> { self.map.borrow().get(value).map(|sym| *sym) } fn clone_uninterned(&self, value: &str) -> SymbolId { let mut syms = self.strings.borrow_mut(); let id = Self::get_next_symbol_id(&mut syms); syms.push(self.copy_slice_into_arena(value)); id } fn intern_utf8<'a>( &self, value: &'a [u8], ) -> Result, (Utf8Error, &'a [u8])> { // Check the raw byte slice _before_ performing a UTF-8 check. // Note that `from_utf8_unchecked` is simply a transmute, // so this check incurs only a hashing cost. if let Some(sym) = self.map.borrow().get( // SAFETY: This is only being used to check if the byte slice // matches an existing intern, which must them already be UTF-8 // (unless an unsafe method was used to add it to begin with). unsafe { from_utf8_unchecked(value) }, ) { return Ok(*sym); } // The string is not yet interned, so we must perform a UTF-8 check // and can then proceed to intern as we normally would. // This does incur a double hashing cost, // just like `intern`. Ok(self.intern_without_lookup( from_utf8(value).map_err(|err| (err, value))?, )) } #[inline] fn contains(&self, value: &str) -> bool { self.map.borrow().contains_key(value) } #[inline] fn len(&self) -> usize { self.map.borrow().len() } fn index_lookup(&'i self, index: SymbolId) -> Option<&'i str> { self.strings.borrow().get(index.as_usize()).map(|str| *str) } } /// Interner using the [Fx Hash][fxhash] hashing function. 
/// /// _This is currently the hash function used by [`DefaultInterner`]._ /// /// If denial of service is not a concern, /// then this will outperform the default /// [`DefaultHasher`](std::collections::hash_map::DefaultHasher) /// (which uses SipHash at the time of writing). /// /// See intern benchmarks for a comparison. pub type FxArenaInterner<'i, Ix = global::ProgSymSize> = ArenaInterner<'i, FxBuildHasher, Ix>; /// Recommended [`Interner`] and configuration (size-agnostic). /// /// The choice of this default relies on the assumption that /// denial-of-service attacks against the hash function are not a /// concern. /// /// For more information on the hashing algorithm, /// see [`FxArenaInterner`]. pub type DefaultInterner<'i, Ix = global::ProgSymSize> = FxArenaInterner<'i, Ix>; /// Recommended [`Interner`] and configuration for compilers and linkers /// processing one or more packages. pub type DefaultProgInterner<'i> = DefaultInterner<'i, global::ProgSymSize>; // Note that these tests assert on standalone interners, not on the globals; // see the `global` sibling package for those tests. 
#[cfg(test)]
mod test {
    use super::*;

    /// System under test: the recommended interner configuration.
    type Sut<'i> = DefaultInterner<'i>;

    /// Equal string contents yield equal symbols regardless of where the
    /// string data lives; distinct contents yield distinct symbols.
    #[test]
    fn recognizes_equal_strings() {
        let a = "foo";
        let b = a.to_string();
        let c = "bar";
        let d = c.to_string();

        let sut = Sut::new();

        let ia = sut.intern(a);
        let ib = sut.intern(&b);
        let ic = sut.intern(c);
        let id = sut.intern(&d);

        assert_eq!(ia, ib);
        assert_eq!(ic, id);
        assert_ne!(ia, ic);
    }

    /// Identifiers are handed out sequentially and only for new strings.
    #[test]
    fn symbol_id_increases_with_each_new_intern() {
        let sut = Sut::new();

        // Remember that identifiers begin at 1
        let first = sut.intern("foo");
        assert_eq!(
            SymbolId::test_from_int(1),
            first,
            "First index should be 1"
        );

        let repeated = sut.intern("foo");
        assert_eq!(
            SymbolId::test_from_int(1),
            repeated,
            "Index should not increment for already-interned symbols"
        );

        let second = sut.intern("bar");
        assert_eq!(
            SymbolId::test_from_int(2),
            second,
            "Index should increment for new symbols"
        );
    }

    /// `len` counts unique interned strings only.
    #[test]
    fn length_increases_with_each_new_intern() {
        let sut = Sut::new();

        assert_eq!(0, sut.len(), "invalid empty len");

        sut.intern("foo");
        assert_eq!(1, sut.len(), "increment len");

        // duplicate
        sut.intern("foo");
        assert_eq!(1, sut.len(), "do not increment len on duplicates");

        sut.intern("bar");
        assert_eq!(2, sut.len(), "increment len (2)");
    }

    /// `contains` reflects whether a string has been interned.
    #[test]
    fn can_check_wither_string_is_interned() {
        let sut = Sut::new();

        let before = sut.contains("foo");
        assert!(!before, "recognize missing value");

        sut.intern("foo");

        let after = sut.contains("foo");
        assert!(after, "recognize interned value");
    }

    /// `intern_soft` only ever returns symbols that already exist.
    #[test]
    fn intern_soft() {
        let sut = Sut::new();

        assert_eq!(None, sut.intern_soft("foo"));

        let foo = sut.intern("foo");
        assert_eq!(Some(foo), sut.intern_soft("foo"));
    }

    #[test]
    fn uninterned_symbol_does_not_compare_equal_to_same_string() {
        let sut = Sut::new();
        let s = "foo";

        let interned = sut.intern(s);
        let uninterned = sut.clone_uninterned(s);

        // The symbols themselves will never be equal...
        assert_ne!(uninterned, interned);

        // ...but their underlying strings are.
        let uninterned_str = sut.index_lookup(uninterned);
        let interned_str = sut.index_lookup(interned);
        assert_eq!(uninterned_str, interned_str);
    }

    // Unlike the previous test, this makes sure that allocating an
    // uninterned symbol is actually not being interned, in that interning
    // another symbol after that won't return an uninterned symbol.
    #[test]
    fn allocating_uninterned_symbol_does_not_intern() {
        let sut = Sut::new();
        let s = "foo";

        // Alloc uninterned _first_
        let uninterned1 = sut.clone_uninterned(s);
        let uninterned2 = sut.clone_uninterned(s);
        let interned1 = sut.intern(s);
        let interned2 = sut.intern(s);

        assert_ne!(uninterned1, interned1);
        assert_ne!(uninterned2, interned1);
        assert_ne!(uninterned1, uninterned2);

        // But we shouldn't have tainted normal interner behavior.
        assert_eq!(interned1, interned2);
    }

    /// The requested capacity is forwarded to the underlying map.
    #[test]
    fn new_with_capacity() {
        let n = 512;
        let sut = Sut::with_capacity(n);

        // note that this is not publicly available
        let actual = sut.map.borrow().capacity();
        assert!(actual >= n);
    }

    /// The unchecked byte variant resolves to the same symbol as `intern`.
    #[test]
    fn intern_utf8_unchecked() {
        let sut = Sut::new();

        let a = sut.intern("foo");
        let b = unsafe { sut.intern_utf8_unchecked(b"foo") };

        assert_eq!(a, b);
    }

    #[test]
    fn lookup_symbol_by_index() {
        let sut = Sut::new();

        // Symbol does not yet exist.
        let missing = sut.index_lookup(SymbolId::test_from_int(1));
        assert!(missing.is_none());

        let sym = sut.intern("foo");
        assert_eq!("foo", sut.index_lookup(sym).unwrap());
    }

    #[test]
    fn intern_utf8_with_new_valid_utf8_bytes() {
        let sut = Sut::new();

        let bytes = "valid".as_bytes();
        let sym = sut.intern_utf8(bytes).expect("unexpected failure");

        assert_eq!(sut.intern("valid"), sym);
    }

    #[test]
    fn intern_utf8_with_existing_valid_utf8_bytes() {
        let sut = Sut::new();
        let s = "valid";

        // Intern normally _first_ so that the `intern_utf8` call will
        // return an existing intern.
        sut.intern(s);

        let sym = sut.intern_utf8(s.as_bytes()).expect("unexpected failure");

        assert_eq!(sut.intern("valid"), sym);
    }

    /// Invalid UTF-8 is rejected with the standard library's error paired
    /// with the offending input.
    #[test]
    fn intern_utf8_fails_with_invalid_utf8_bytes() {
        let sut = Sut::new();

        // Invalid two-byte encoding.
        let bytes = &[0b11000000u8];

        let result = sut.intern_utf8(bytes);

        // Sanity-check the fixture before judging the interner's result.
        let expected = match from_utf8(bytes) {
            Ok(_) => panic!("test string is valid UTF-8"),
            Err(err) => err,
        };

        let (given_err, given_u8) = match result {
            Ok(_) => panic!("expected error"),
            Err(pair) => pair,
        };

        assert_eq!(given_u8, bytes);
        assert_eq!(given_err, expected);
    }
}