// String interner // // Copyright (C) 2014-2021 Ryan Specialty Group, LLC. // // This file is part of TAME. // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . //! Interners used to intern values as symbols. //! //! See the [parent module](super) for more information. use super::{Symbol, SymbolId, SymbolIndexSize}; use crate::global; use bumpalo::Bump; use fxhash::FxBuildHasher; use std::cell::RefCell; use std::collections::HashMap; use std::convert::{TryFrom, TryInto}; use std::fmt::Debug; use std::hash::BuildHasher; /// Create, store, compare, and retrieve [`Symbol`] values. /// /// Interners accept string slices and produce values of type [`Symbol`]. /// A reference to the same [`Symbol`] will always be returned for a given /// string, /// allowing symbols to be compared for equality cheaply by comparing /// pointers. /// Symbol locations in memory are fixed for the lifetime of the interner. /// /// If you care whether a value has been interned yet or not, /// see [`intern_soft`][Interner::intern_soft`] and /// [`contains`](Interner::contains). /// /// See the [module-level documentation](self) for an example. pub trait Interner<'i, Ix: SymbolIndexSize> { /// Intern a string slice or return an existing [`Symbol`]. /// /// If the provided string has already been interned, /// then a reference to the existing [`Symbol`] will be returned. /// Otherwise, /// the string will be interned and a new [`Symbol`] created. /// /// The lifetime of the returned symbol is bound to the lifetime of the /// underlying intern pool. /// /// To retrieve an existing symbol _without_ interning, /// see [`intern_soft`](Interner::intern_soft). fn intern(&'i self, value: &str) -> &'i Symbol<'i, Ix>; /// Retrieve an existing intern for the string slice `s`. /// /// Unlike [`intern`](Interner::intern), /// this will _not_ intern the string if it has not already been /// interned. fn intern_soft(&'i self, value: &str) -> Option<&'i Symbol<'i, Ix>>; /// Determine whether the given value has already been interned. fn contains(&self, value: &str) -> bool; /// Number of interned strings. /// /// This count will increase each time a unique string is interned. /// It does not increase when a string is already interned. fn len(&self) -> usize; /// Look up a previously interned [`Symbol`] by its [`SymbolId`]. /// /// This will always return a [`Symbol`] as long as the provided `index` /// represents a symbol interned with this interner. /// If the index is not found, /// the result is [`None`]. /// /// This method is most useful when storing [`Symbol`] is not possible /// or desirable. /// For example, /// borrowed [`Symbol`] references require lifetimes, /// whereas [`SymbolId`] is both owned _and_ [`Copy`]. /// [`SymbolId`] is also much smaller than [`Symbol`]. fn index_lookup( &'i self, index: SymbolId, ) -> Option<&'i Symbol<'i, Ix>>; /// Intern an assumed-UTF8 slice of bytes or return an existing /// [`Symbol`]. /// /// Safety /// ====== /// This function is unsafe because it uses /// [`std::str::from_utf8_unchecked`]. /// It is provided for convenience when interning from trusted binary /// data /// (such as [object files][]). /// /// [object files]: crate::obj unsafe fn intern_utf8_unchecked( &'i self, value: &[u8], ) -> &'i Symbol<'i, Ix> { self.intern(std::str::from_utf8_unchecked(value)) } } /// An interner backed by an [arena](bumpalo). /// /// Since interns exist until the interner itself is freed, /// an arena is a much more efficient and appropriate memory allocation /// strategy. /// This further provides a stable location in memory for symbol data. /// /// For the recommended configuration, /// see [`DefaultInterner`]. /// /// See the [module-level documentation](self) for examples and more /// information on how to use this interner. pub struct ArenaInterner<'i, S, Ix> where S: BuildHasher + Default, Ix: SymbolIndexSize, { /// String and [`Symbol`] storage. arena: Bump, /// Symbol references by index. /// /// This vector enables looking up a [`Symbol`] using its /// [`SymbolId`]. /// /// The first index must always be populated during initialization to /// ensure that [`SymbolId`] will never beĀ `0`. indexes: RefCell>>, /// Map of interned strings to their respective [`Symbol`]. /// /// Both strings and symbols are allocated within `arena`. map: RefCell, S>>, } impl<'i, S, Ix> ArenaInterner<'i, S, Ix> where S: BuildHasher + Default, Ix: SymbolIndexSize, { /// Initialize a new interner with no initial capacity. /// /// Prefer [`with_capacity`](ArenaInterner::with_capacity) when possible. #[inline] pub fn new() -> Self { Self::with_capacity(0) } /// Initialize a new interner with an initial capacity for the /// underlying [`HashMap`]. /// /// The given `capacity` has no affect on arena allocation. /// Specifying initial capacity is important only for the map of strings /// to symbols because it will reallocate and re-hash its contents /// once capacity is exceeded. /// See benchmarks. /// /// If reallocation is a major concern, /// a [consistent hashing algorithm][consistent] could be considered, /// but the implementation will still incur the cost of copying /// the [`HashMap`]'s contents to a new location in memory. /// /// [consistent]: https://en.wikipedia.org/wiki/Consistent_hashing #[inline] pub fn with_capacity(capacity: usize) -> Self { let mut indexes = Vec::<&'i Symbol<'i, Ix>>::with_capacity(capacity); // The first index is not used since SymbolId cannot be 0. indexes.push(Ix::dummy_sym()); Self { arena: Bump::new(), indexes: RefCell::new(indexes), map: RefCell::new(HashMap::with_capacity_and_hasher( capacity, Default::default(), )), } } } impl<'i, S, Ix> Interner<'i, Ix> for ArenaInterner<'i, S, Ix> where S: BuildHasher + Default, Ix: SymbolIndexSize, >::Error: Debug, { fn intern(&'i self, value: &str) -> &'i Symbol<'i, Ix> { let mut map = self.map.borrow_mut(); if let Some(sym) = map.get(value) { return sym; } let mut syms = self.indexes.borrow_mut(); let next_index: Ix = syms .len() .try_into() .expect("internal error: SymbolId range exhausted"); // This is not actually unsafe because next_index is always >0 // from initialization. debug_assert!(Ix::new(next_index).is_some()); // != 0 check let id = unsafe { SymbolId::from_int_unchecked(next_index) }; // Copy string slice into the arena. let clone: &'i str = unsafe { &*(std::str::from_utf8_unchecked( self.arena.alloc_slice_clone(value.as_bytes()), ) as *const str) }; // Symbols are also stored within the arena, adjacent to the // string. This ensures that both have stable locations in memory. let sym: &'i Symbol<'i, Ix> = self.arena.alloc(Symbol::new(id, clone)); map.insert(clone, sym); syms.push(sym); sym } #[inline] fn intern_soft(&'i self, value: &str) -> Option<&'i Symbol<'i, Ix>> { self.map.borrow().get(value).map(|sym| *sym) } #[inline] fn contains(&self, value: &str) -> bool { self.map.borrow().contains_key(value) } #[inline] fn len(&self) -> usize { self.map.borrow().len() } fn index_lookup( &'i self, index: SymbolId, ) -> Option<&'i Symbol<'i, Ix>> { self.indexes.borrow().get(index.as_usize()).map(|sym| *sym) } } /// Interner using the [Fx Hash][fxhash] hashing function. /// /// _This is currently the hash function used by [`DefaultInterner`]._ /// /// If denial of service is not a concern, /// then this will outperform the default /// [`DefaultHasher`](std::collections::hash_map::DefaultHasher) /// (which uses SipHash at the time of writing). /// /// See intern benchmarks for a comparison. pub type FxArenaInterner<'i, Ix> = ArenaInterner<'i, FxBuildHasher, Ix>; /// Recommended [`Interner`] and configuration. /// /// The choice of this default relies on the assumption that /// denial-of-service attacks against the hash function are not a /// concern. /// /// For more information on the hashing algorithm, /// see [`FxArenaInterner`]. pub type DefaultInterner<'i, Ix> = FxArenaInterner<'i, Ix>; /// Interner for individual packages and their dependencies. /// /// This type should be preferred to [`DefaultPkgInterner`] when only a /// single package's symbols are being processed, /// since it can be better packed into structs. pub type DefaultPkgInterner<'i> = DefaultInterner<'i, global::PkgSymSize>; /// Interner for entire programs. /// /// This interner holds symbols with a larger underyling datatype than /// [`DefaultPkgInterner`]. /// It is intended for use by linkers or anything else that needs to process /// a large number of packages in a program simultaneously. pub type DefaultProgInterner<'i> = DefaultInterner<'i, global::ProgSymSize>; #[cfg(test)] mod test { use super::*; type Sut<'i> = DefaultInterner<'i, global::ProgSymSize>; #[test] fn recognizes_equal_strings() { let a = "foo"; let b = a.to_string(); let c = "bar"; let d = c.to_string(); let sut = Sut::new(); let (ia, ib, ic, id) = (sut.intern(a), sut.intern(&b), sut.intern(c), sut.intern(&d)); assert_eq!(ia, ib); assert_eq!(&ia, &ib); assert_eq!(*ia, *ib); assert_eq!(ic, id); assert_eq!(&ic, &id); assert_eq!(*ic, *id); assert_ne!(ia, ic); assert_ne!(&ia, &ic); assert_ne!(*ia, *ic); } #[test] fn symbol_id_increases_with_each_new_intern() { let sut = Sut::new(); // Remember that identifiers begin at 1 assert_eq!( SymbolId::from_int(1), sut.intern("foo").index(), "First index should be 1" ); assert_eq!( SymbolId::from_int(1), sut.intern("foo").index(), "Index should not increment for already-interned symbols" ); assert_eq!( SymbolId::from_int(2), sut.intern("bar").index(), "Index should increment for new symbols" ); } #[test] fn length_increases_with_each_new_intern() { let sut = Sut::new(); assert_eq!(0, sut.len(), "invalid empty len"); sut.intern("foo"); assert_eq!(1, sut.len(), "increment len"); // duplicate sut.intern("foo"); assert_eq!(1, sut.len(), "do not increment len on duplicates"); sut.intern("bar"); assert_eq!(2, sut.len(), "increment len (2)"); } #[test] fn can_check_wither_string_is_interned() { let sut = Sut::new(); assert!(!sut.contains("foo"), "recognize missing value"); sut.intern("foo"); assert!(sut.contains("foo"), "recognize interned value"); } #[test] fn intern_soft() { let sut = Sut::new(); assert_eq!(None, sut.intern_soft("foo")); let foo = sut.intern("foo"); assert_eq!(Some(foo), sut.intern_soft("foo")); } #[test] fn new_with_capacity() { let n = 512; let sut = Sut::with_capacity(n); // note that this is not publicly available assert!(sut.map.borrow().capacity() >= n); } #[test] fn intern_utf8_unchecked() { let sut = Sut::new(); let a = sut.intern("foo"); let b = unsafe { sut.intern_utf8_unchecked(b"foo") }; assert_eq!(a, b); } #[test] fn lookup_symbol_by_index() { let sut = Sut::new(); // Symbol does not yet exist. assert!(sut.index_lookup(SymbolId::from_int(1)).is_none()); let sym = sut.intern("foo"); assert_eq!(Some(sym), sut.index_lookup(sym.index())); assert_eq!(Some(sym), sut.index_lookup(sym.into())); } }