tame/tamer/src/sym/interner.rs

637 lines
20 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters!

This file contains invisible Unicode characters that may be processed differently from what appears below. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to reveal hidden characters.

// String interner
//
// Copyright (C) 2014-2023 Ryan Specialty, LLC.
//
// This file is part of TAME.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//! Interners used to intern values as symbols.
//!
//! See the [parent module](super) for more information.
//!
//!
//! Using Interners Directly (Without Global State)
//! ===============================================
//! Please do not do this unless you have a compelling use case and know
//! what you are doing,
//! including understanding how to mitigate mixing of [`SymbolId`]s,
//! such as with newtypes or encapsulation.
//! Otherwise,
//! use the global interners instead,
//! as documented in the [parent module](super).
//!
//! ```
//! use tamer::sym::{DefaultProgInterner, Interner, SymbolId};
//!
//! // Inputs to be interned
//! let a = "foo";
//! let b = &"foo".to_string();
//! let c = "foobar";
//! let d = &c[0..3];
//!
//! // Interners employ interior mutability and so do not need to be
//! // declared `mut`
//! let interner = DefaultProgInterner::new();
//!
//! let (ia, ib, ic, id) = (
//! interner.intern(a),
//! interner.intern(b),
//! interner.intern(c),
//! interner.intern(d),
//! );
//!
//! assert_eq!(ia, ib);
//! assert_eq!(ia, id);
//! assert_eq!(ib, id);
//! assert_ne!(ia, ic);
//!
//! // Only "foo" and "foobar" are interned
//! assert_eq!(2, interner.len());
//! assert!(interner.contains("foo"));
//! assert!(interner.contains("foobar"));
//! assert!(!interner.contains("something else"));
//!
//! // Symbols can also be looked up by index.
//! assert_eq!("foo", interner.index_lookup(ia).unwrap());
//! ```
use super::{SymbolId, SymbolIndexSize};
use crate::global;
use bumpalo::Bump;
use fxhash::FxBuildHasher;
use std::cell::RefCell;
use std::collections::HashMap;
use std::convert::{TryFrom, TryInto};
use std::fmt::Debug;
use std::hash::BuildHasher;
use std::str::{from_utf8, from_utf8_unchecked, Utf8Error};
/// Create, store, compare, and retrieve interned values.
///
/// Interners accept string slices and produce values of type [`SymbolId`].
/// The same [`SymbolId`] will always be returned for a given string,
/// allowing symbols to be compared for equality cheaply by comparing
/// integers.
/// Symbol locations in memory are fixed for the lifetime of the interner,
/// and can be retrieved as [`str`] using
/// [`index_lookup`](Interner::index_lookup).
///
/// If you care whether a value has been interned yet or not,
/// see [`intern_soft`](Interner::intern_soft) and
/// [`contains`](Interner::contains).
///
/// See the [module-level documentation](self) for an example.
/// For interfaces to the global interners that indirectly use these
/// methods,
/// see the [parent module](super).
pub trait Interner<'i, Ix: SymbolIndexSize> {
    /// Intern a string slice or return an existing [`SymbolId`].
    ///
    /// If the provided string has already been interned,
    /// then an existing [`SymbolId`] will be returned.
    /// Otherwise,
    /// the string will be interned and a new [`SymbolId`] allocated.
    ///
    /// To retrieve an existing symbol _without_ interning,
    /// see [`intern_soft`](Interner::intern_soft).
    fn intern(&self, value: &str) -> SymbolId<Ix>;

    /// Retrieve an existing intern for the provided string slice.
    ///
    /// Unlike [`intern`](Interner::intern),
    /// this will _not_ intern the string if it has not already been
    /// interned.
    fn intern_soft(&self, value: &str) -> Option<SymbolId<Ix>>;

    /// Copy the provided slice into the intern pool and produce a symbol,
    /// but do not intern the symbol.
    ///
    /// The symbol will never compare equal to any other symbol,
    /// regardless of the underlying string.
    /// Consequently,
    /// this evades the cost of hashing the string,
    /// allowing for a [`SymbolId`] to be used in place of [`String`].
    ///
    /// See "Uninterned Symbols" in the documentation of the
    /// [`sym` module](super) for more information.
    fn clone_uninterned(&self, value: &str) -> SymbolId<Ix>;

    /// Determine whether the given value has already been interned.
    ///
    /// This is equivalent to `intern_soft(value).is_some()`.
    fn contains(&self, value: &str) -> bool;

    /// Number of interned strings in this interner's pool.
    ///
    /// This count will increase each time a unique string is interned.
    /// It does not increase when a string is already interned.
    fn len(&self) -> usize;

    /// Look up a symbol's string value by its [`SymbolId`].
    ///
    /// This will always return a [`str`] as long as the provided
    /// `index` represents a symbol interned with this interner.
    /// If the index is not found,
    /// the result is [`None`].
    ///
    /// [`str`] requires significantly more storage than an appropriate
    /// [`SymbolId`] and should only be used when a string value must be
    /// written (e.g. to a file or displayed to the user).
    fn index_lookup(&'i self, index: SymbolId<Ix>) -> Option<&'i str>;

    /// Intern a byte slice as a UTF-8 string.
    ///
    /// This method is intended as a performance optimization to avoid
    /// unnecessary UTF-8 checks when a byte slice has already been
    /// interned.
    ///
    /// This first checks to see if the provided byte slice matches an
    /// existing intern,
    /// returning the symbol if found.
    /// This allows us to skip the cost of a UTF-8 check for strings that
    /// have already been encountered,
    /// since their presence in the pool means that the string was
    /// either
    /// (a) already interned as a valid UTF-8 string; or
    /// (b) was interned using an unsafe function.
    /// In the case of (b),
    /// the safety violation is the fault of the original caller,
    /// and there's nothing we can do about it now.
    ///
    /// Note that this optimization is only beneficial when a string has
    /// already been interned.
    /// To avoid the cost of UTF-8 checks entirely,
    /// see [`Interner::intern_utf8_unchecked`].
    ///
    /// If the byte slice does not represent a valid UTF-8 string,
    /// a [`Utf8Error`] will be returned along with a reference to the
    /// provided byte string.
    /// The purpose of this pair is to simplify error conversions
    /// using `?` so that errors can contain additional context.
    fn intern_utf8<'a>(
        &self,
        value: &'a [u8],
    ) -> Result<SymbolId<Ix>, (Utf8Error, &'a [u8])>;

    /// Intern an assumed-UTF-8 slice of bytes or return an existing
    /// [`SymbolId`].
    ///
    /// Safety
    /// ======
    /// This function is unsafe because it uses [`from_utf8_unchecked`]:
    /// the caller must ensure that `value` is valid UTF-8.
    /// It is provided for convenience when interning from trusted binary
    /// data
    /// (such as [object files][]).
    ///
    /// [object files]: crate::obj
    unsafe fn intern_utf8_unchecked(&self, value: &[u8]) -> SymbolId<Ix> {
        // SAFETY: The caller is responsible for `value` being valid UTF-8,
        // as stated in this function's own safety contract.
        self.intern(unsafe { from_utf8_unchecked(value) })
    }

    /// Copy the provided assumed-UTF-8 slice of bytes into the intern pool
    /// and produce a symbol,
    /// but do not intern the symbol.
    ///
    /// See [`clone_uninterned`](Interner::clone_uninterned) for more
    /// information.
    ///
    /// Safety
    /// ======
    /// This function is unsafe because it uses [`from_utf8_unchecked`]:
    /// the caller must ensure that `value` is valid UTF-8.
    /// It is provided for convenience when interning from trusted binary
    /// data
    /// (such as [object files][]).
    ///
    /// [object files]: crate::obj
    unsafe fn clone_uninterned_utf8_unchecked(
        &self,
        value: &[u8],
    ) -> SymbolId<Ix> {
        // SAFETY: The caller is responsible for `value` being valid UTF-8,
        // as stated in this function's own safety contract.
        self.clone_uninterned(unsafe { from_utf8_unchecked(value) })
    }
}
/// An interner backed by an [arena](bumpalo).
///
/// Since all symbols exist until the interner itself is freed,
/// an arena is a much more efficient and appropriate memory allocation
/// strategy.
/// This also provides a stable location in memory for symbol data.
///
/// Both the symbol table and the lookup map are wrapped in [`RefCell`],
/// which is what allows the interning methods to take `&self`.
///
/// Type parameters:
///
/// - `'i` is the lifetime of the string slices held by `strings` and
/// `map`.
/// - `S` builds the hasher used by `map`.
/// - `Ix` is the index type backing each [`SymbolId`],
/// defaulting to `global::ProgSymSize`.
///
/// For the recommended configuration,
/// see [`DefaultInterner`].
///
/// See the [module-level documentation](self) for examples and more
/// information on how to use this interner.
pub struct ArenaInterner<'i, S, Ix = global::ProgSymSize>
where
S: BuildHasher + Default,
Ix: SymbolIndexSize,
{
/// Storage for interned strings.
///
/// Every slice referenced by `strings` and `map` is a copy that lives
/// here.
arena: Bump,
/// Symbol strings by [`SymbolId`].
///
/// The first index must always be populated during initialization to
/// ensure that [`SymbolId`] will never be `0`;
/// it holds an empty-string placeholder.
///
/// This table holds both interned strings and uninterned clones
/// (see `clone_uninterned`),
/// so it may contain more entries than `map`.
///
/// These string slices are stored in `arena`.
strings: RefCell<Vec<&'i str>>,
/// Map of interned strings to their respective [`SymbolId`].
///
/// This allows us to determine whether a string has already been
/// interned and, if so, to return its corresponding symbol.
/// Uninterned clones are never added here,
/// and the length of this map is what `len` reports.
map: RefCell<HashMap<&'i str, SymbolId<Ix>, S>>,
}
impl<'i, S, Ix> ArenaInterner<'i, S, Ix>
where
S: BuildHasher + Default,
Ix: SymbolIndexSize,
<Ix as TryFrom<usize>>::Error: Debug,
{
/// Initialize a new interner with no initial capacity.
///
/// Prefer [`with_capacity`](ArenaInterner::with_capacity) when possible.
#[inline]
pub fn new() -> Self {
Self::with_capacity(0)
}
/// Initialize a new interner with an initial capacity for the
/// underlying [`HashMap`].
///
/// The given `capacity` has no affect on arena allocation.
/// Specifying initial capacity is important only for the map of strings
/// to symbols because it will reallocate and re-hash its contents
/// once capacity is exceeded.
/// See benchmarks.
///
/// If reallocation is a major concern,
/// a [consistent hashing algorithm][consistent] could be considered,
/// but the implementation will still incur the cost of copying
/// the [`HashMap`]'s contents to a new location in memory.
///
/// [consistent]: https://en.wikipedia.org/wiki/Consistent_hashing
#[inline]
pub fn with_capacity(capacity: usize) -> Self {
let mut strings = Vec::<_>::with_capacity(capacity);
// The first index is not used since SymbolId cannot be 0.
strings.push("");
Self {
arena: Bump::new(),
strings: RefCell::new(strings),
map: RefCell::new(HashMap::with_capacity_and_hasher(
capacity,
Default::default(),
)),
}
}
#[inline]
fn get_next_symbol_id(syms: &mut Vec<&'i str>) -> SymbolId<Ix> {
let next_index: Ix = syms
.len()
.try_into()
.expect("internal error: SymbolId range exhausted");
// This is not actually unsafe because next_index is always >0
// from initialization.
debug_assert!(Ix::new(next_index).is_some()); // != 0 check
unsafe { SymbolId::from_int_unchecked(next_index) }
}
#[inline]
fn copy_slice_into_arena(&self, value: &str) -> &'i str {
unsafe {
&*(from_utf8_unchecked(
self.arena.alloc_slice_clone(value.as_bytes()),
) as *const str)
}
}
/// Intern the provided value without looking for an existing intern.
///
/// _This is an internal function that should only be used after having
/// already checked for an existing intern._
/// It exists only to share common logic across methods.
#[inline]
fn intern_without_lookup(&self, value: &str) -> SymbolId<Ix> {
let mut syms = self.strings.borrow_mut();
let id = Self::get_next_symbol_id(&mut syms);
let clone = self.copy_slice_into_arena(value);
self.map.borrow_mut().insert(clone, id);
syms.push(clone);
id
}
}
impl<'i, S, Ix> Interner<'i, Ix> for ArenaInterner<'i, S, Ix>
where
    S: BuildHasher + Default,
    Ix: SymbolIndexSize,
    <Ix as TryFrom<usize>>::Error: Debug,
{
    fn intern(&self, value: &str) -> SymbolId<Ix> {
        // `intern_soft` releases its borrow of the map before returning,
        // so it is safe for the fallback to borrow it mutably.
        self.intern_soft(value)
            .unwrap_or_else(|| self.intern_without_lookup(value))
    }

    #[inline]
    fn intern_soft(&self, value: &str) -> Option<SymbolId<Ix>> {
        self.map.borrow().get(value).copied()
    }

    fn clone_uninterned(&self, value: &str) -> SymbolId<Ix> {
        let mut syms = self.strings.borrow_mut();

        // The string is added to the symbol table but _not_ to the map,
        // so it can never be returned as an existing intern.
        let id = Self::get_next_symbol_id(&mut syms);
        syms.push(self.copy_slice_into_arena(value));

        id
    }

    fn intern_utf8<'a>(
        &self,
        value: &'a [u8],
    ) -> Result<SymbolId<Ix>, (Utf8Error, &'a [u8])> {
        // Check the raw byte slice _before_ performing a UTF-8 check.
        // Note that `from_utf8_unchecked` is simply a transmute,
        // so this check incurs only a hashing cost.
        let existing = self
            .map
            .borrow()
            .get(
                // SAFETY: This is only being used to check if the byte
                // slice matches an existing intern,
                // which must then already be UTF-8
                // (unless an unsafe method was used to add it to begin
                // with).
                // NOTE(review): for input that is not valid UTF-8 this
                // still builds a `&str` over those bytes for hashing
                // and comparison;
                // confirm that this is acceptable under the
                // contract of `from_utf8_unchecked`.
                unsafe { from_utf8_unchecked(value) },
            )
            .copied();

        if let Some(sym) = existing {
            return Ok(sym);
        }

        // The string is not yet interned, so we must perform a UTF-8 check
        // and can then proceed to intern as we normally would.
        // This does incur a double hashing cost,
        // just like `intern`.
        let valid = from_utf8(value).map_err(|err| (err, value))?;

        Ok(self.intern_without_lookup(valid))
    }

    #[inline]
    fn contains(&self, value: &str) -> bool {
        self.map.borrow().contains_key(value)
    }

    #[inline]
    fn len(&self) -> usize {
        // Only the map is consulted,
        // so uninterned clones are not counted.
        self.map.borrow().len()
    }

    fn index_lookup(&'i self, index: SymbolId<Ix>) -> Option<&'i str> {
        self.strings.borrow().get(index.as_usize()).copied()
    }
}
/// Interner using the [Fx Hash][fxhash] hashing function.
///
/// _This is currently the hash function used by [`DefaultInterner`]._
///
/// If denial of service is not a concern,
/// then this will outperform the default
/// [`DefaultHasher`](std::collections::hash_map::DefaultHasher)
/// (which uses SipHash at the time of writing).
///
/// See intern benchmarks for a comparison.
///
/// This is [`ArenaInterner`] with its hasher fixed to `FxBuildHasher`;
/// the index size `Ix` remains configurable and defaults to
/// `global::ProgSymSize`.
pub type FxArenaInterner<'i, Ix = global::ProgSymSize> =
ArenaInterner<'i, FxBuildHasher, Ix>;
/// Recommended [`Interner`] and configuration (size-agnostic).
///
/// The choice of this default relies on the assumption that
/// denial-of-service attacks against the hash function are not a
/// concern.
///
/// This is currently a direct alias of [`FxArenaInterner`] with the same
/// default for `Ix`.
///
/// For more information on the hashing algorithm,
/// see [`FxArenaInterner`].
pub type DefaultInterner<'i, Ix = global::ProgSymSize> =
FxArenaInterner<'i, Ix>;
/// Recommended [`Interner`] and configuration for compilers and linkers
/// processing one or more packages.
///
/// This fixes the index size of [`DefaultInterner`] to
/// `global::ProgSymSize`,
/// which is also that alias' default;
/// `DefaultProgInterner<'i>` and `DefaultInterner<'i>` therefore name
/// the same type.
pub type DefaultProgInterner<'i> = DefaultInterner<'i, global::ProgSymSize>;
// Note that these tests assert on standalone interners, not on the globals;
// see the `global` sibling package for those tests.
#[cfg(test)]
mod test {
    use super::*;

    type Sut<'i> = DefaultInterner<'i>;

    #[test]
    fn recognizes_equal_strings() {
        let a = "foo";
        let b = a.to_string();
        let c = "bar";
        let d = c.to_string();

        let sut = Sut::new();

        let (ia, ib, ic, id) =
            (sut.intern(a), sut.intern(&b), sut.intern(c), sut.intern(&d));

        assert_eq!(ia, ib);
        assert_eq!(ic, id);
        assert_ne!(ia, ic);
    }

    #[test]
    fn symbol_id_increases_with_each_new_intern() {
        let sut = Sut::new();

        // Remember that identifiers begin at 1
        assert_eq!(
            SymbolId::test_from_int(1),
            sut.intern("foo"),
            "First index should be 1"
        );

        assert_eq!(
            SymbolId::test_from_int(1),
            sut.intern("foo"),
            "Index should not increment for already-interned symbols"
        );

        assert_eq!(
            SymbolId::test_from_int(2),
            sut.intern("bar"),
            "Index should increment for new symbols"
        );
    }

    #[test]
    fn length_increases_with_each_new_intern() {
        let sut = Sut::new();

        assert_eq!(0, sut.len(), "invalid empty len");

        sut.intern("foo");
        assert_eq!(1, sut.len(), "increment len");

        // duplicate
        sut.intern("foo");
        assert_eq!(1, sut.len(), "do not increment len on duplicates");

        sut.intern("bar");
        assert_eq!(2, sut.len(), "increment len (2)");
    }

    #[test]
    fn can_check_whether_string_is_interned() {
        let sut = Sut::new();

        assert!(!sut.contains("foo"), "recognize missing value");
        sut.intern("foo");
        assert!(sut.contains("foo"), "recognize interned value");
    }

    #[test]
    fn intern_soft() {
        let sut = Sut::new();

        assert_eq!(None, sut.intern_soft("foo"));

        let foo = sut.intern("foo");
        assert_eq!(Some(foo), sut.intern_soft("foo"));
    }

    #[test]
    fn uninterned_symbol_does_not_compare_equal_to_same_string() {
        let sut = Sut::new();
        let s = "foo";

        let interned = sut.intern(s);
        let uninterned = sut.clone_uninterned(s);

        // The symbols themselves will never be equal...
        assert_ne!(uninterned, interned);

        // ...but their underlying strings are.
        assert_eq!(sut.index_lookup(uninterned), sut.index_lookup(interned));
    }

    // Unlike the previous test, this makes sure that allocating an
    // uninterned symbol is actually not being interned, in that interning
    // another symbol after that won't return an uninterned symbol.
    #[test]
    fn allocating_uninterned_symbol_does_not_intern() {
        let sut = Sut::new();
        let s = "foo";

        // Alloc uninterned _first_
        let uninterned1 = sut.clone_uninterned(s);
        let uninterned2 = sut.clone_uninterned(s);
        let interned1 = sut.intern(s);
        let interned2 = sut.intern(s);

        assert_ne!(uninterned1, interned1);
        assert_ne!(uninterned2, interned1);
        assert_ne!(uninterned1, uninterned2);

        // But we shouldn't have tainted normal interner behavior.
        assert_eq!(interned1, interned2);
    }

    // The unchecked byte-slice variant must behave like
    // `clone_uninterned`: the string is retrievable by its symbol but is
    // neither findable by value nor counted as an intern.
    #[test]
    fn clone_uninterned_utf8_unchecked_copies_without_interning() {
        let sut = Sut::new();

        // SAFETY: The byte literal is ASCII and therefore valid UTF-8.
        let sym = unsafe { sut.clone_uninterned_utf8_unchecked(b"foo") };

        assert_eq!(Some("foo"), sut.index_lookup(sym));
        assert!(!sut.contains("foo"), "uninterned clone must not intern");
        assert_eq!(0, sut.len(), "uninterned clone must not affect len");
    }

    #[test]
    fn new_with_capacity() {
        let n = 512;
        let sut = Sut::with_capacity(n);

        // note that this is not publicly available
        assert!(sut.map.borrow().capacity() >= n);
    }

    #[test]
    fn intern_utf8_unchecked() {
        let sut = Sut::new();

        let a = sut.intern("foo");
        let b = unsafe { sut.intern_utf8_unchecked(b"foo") };

        assert_eq!(a, b);
    }

    #[test]
    fn lookup_symbol_by_index() {
        let sut = Sut::new();

        // Symbol does not yet exist.
        assert!(sut.index_lookup(SymbolId::test_from_int(1)).is_none());

        let sym = sut.intern("foo");
        assert_eq!("foo", sut.index_lookup(sym).unwrap());
    }

    #[test]
    fn intern_utf8_with_new_valid_utf8_bytes() {
        let sut = Sut::new();
        let bytes = "valid".as_bytes();

        let sym = sut.intern_utf8(bytes).expect("unexpected failure");

        assert_eq!(sut.intern("valid"), sym);
    }

    #[test]
    fn intern_utf8_with_existing_valid_utf8_bytes() {
        let sut = Sut::new();
        let s = "valid";

        // Intern normally _first_ so that the `intern_utf8` call will
        // return an existing intern.
        sut.intern(s);

        let sym = sut.intern_utf8(s.as_bytes()).expect("unexpected failure");

        assert_eq!(sut.intern("valid"), sym);
    }

    #[test]
    fn intern_utf8_fails_with_invalid_utf8_bytes() {
        let sut = Sut::new();

        // Invalid two-byte encoding.
        let bytes = &[0b11000000u8];

        let result = sut.intern_utf8(bytes);

        // The error must carry both the same `Utf8Error` that a direct
        // check produces and the original input bytes.
        match (result, from_utf8(bytes)) {
            (_, Ok(_)) => panic!("test string is valid UTF-8"),
            (Ok(_), _) => panic!("expected error"),
            (Err((given_err, given_u8)), Err(expected)) => {
                assert_eq!(given_u8, bytes);
                assert_eq!(given_err, expected);
            }
        }
    }
}