tamer: sym: Uninterned symbols

This adds support for uninterned symbols.  This came about as I was creating
Xir (not yet committed) where I had to decide if I wanted `SymbolId` for all
values, even though some values (e.g. large text blocks like compiled code
fragments for xmle files) will never be compared, and so would be wastefully
hashed.

Previous IRs used `String`, but that was clumsy; see documentation in this
commit for rationale.
main
Mike Gerwitz 2021-08-13 22:54:04 -04:00
parent a008d11fb3
commit ce233ac01d
4 changed files with 152 additions and 16 deletions

View File

@ -90,6 +90,16 @@ mod interner {
});
}
#[bench]
/// Allocate an uninterned symbol for every string produced by
/// `gen_strs(1000)`,
/// using a fresh `RandomState`-hashed interner on each iteration.
fn with_all_new_uninterned_1000(bench: &mut Bencher) {
    let strs = gen_strs(1000);

    bench.iter(|| {
        // A new interner per iteration, so no run benefits from state left
        // behind by an earlier one.
        let sut = ArenaInterner::<RandomState, u32>::new();

        for s in strs.iter() {
            drop(sut.clone_uninterned(&s));
        }
    });
}
#[bench]
/// This is our baseline with a raw Rc<str>.
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
@ -133,6 +143,17 @@ mod interner {
});
}
// For comparison with uninterned symbols.
#[bench]
/// Baseline that allocates an owned [`String`] for every string produced
/// by `gen_strs(1000)`.
fn with_all_new_owned_string_1000_baseline(bench: &mut Bencher) {
    let strs = gen_strs(1000);

    bench.iter(|| {
        // The interner is never consulted by this baseline.
        // It is still constructed here (and dropped at the end of each
        // iteration) so that the measured work is unchanged;
        // the leading underscore only silences the unused-variable lint.
        // NOTE(review): presumably kept so that this baseline pays the
        // same setup cost as the sibling benchmarks; confirm, or remove.
        let _sut = ArenaInterner::<FxBuildHasher, u32>::new();

        strs.iter().map(|s| String::from(s)).for_each(drop);
    });
}
#[bench]
fn with_all_new_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
@ -143,6 +164,16 @@ mod interner {
});
}
#[bench]
/// Allocate an uninterned symbol for every string produced by
/// `gen_strs(1000)`,
/// using a fresh `FxBuildHasher`-hashed interner on each iteration.
fn with_all_new_uninterned_1000(bench: &mut Bencher) {
    let strs = gen_strs(1000);

    bench.iter(|| {
        // A new interner per iteration, so no run benefits from state left
        // behind by an earlier one.
        let sut = ArenaInterner::<FxBuildHasher, u32>::new();

        for s in strs.iter() {
            drop(sut.clone_uninterned(&s));
        }
    });
}
#[bench]
/// This is our baseline with a raw Rc<str>.
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {

View File

@ -115,6 +115,19 @@ pub trait Interner<'i, Ix: SymbolIndexSize> {
/// interned.
fn intern_soft(&self, value: &str) -> Option<SymbolId<Ix>>;
/// Copy the provided slice into the intern pool and produce a symbol,
/// but do not intern the symbol.
///
/// The symbol will never compare equal to any other symbol,
/// regardless of the underlying string.
/// Consequently,
/// this evades the cost of hashing the string,
/// allowing for a [`SymbolId`] to be used in place of [`String`].
///
/// Each call yields a distinct symbol,
/// even when invoked repeatedly with the same `value`,
/// and subsequently interning that same string will not return a
/// symbol that was produced by this method.
///
/// See "Uninterned Symbols" in the documentation of the
/// [`sym` module](super) for more information.
fn clone_uninterned(&self, value: &str) -> SymbolId<Ix>;
/// Determine whether the given value has already been interned.
///
/// This is equivalent to `intern_soft(value).is_some()`.
@ -190,6 +203,7 @@ impl<'i, S, Ix> ArenaInterner<'i, S, Ix>
where
S: BuildHasher + Default,
Ix: SymbolIndexSize,
<Ix as TryFrom<usize>>::Error: Debug,
{
/// Initialize a new interner with no initial capacity.
///
@ -230,6 +244,28 @@ where
)),
}
}
/// Produce the [`SymbolId`] corresponding to the next string that will be
/// pushed onto `syms`.
///
/// The identifier is derived from the current length of `syms`;
/// this function does not itself push anything.
///
/// # Panics
/// Panics if the length of `syms` cannot be represented by `Ix`.
#[inline]
fn get_next_symbol_id(syms: &mut Vec<&'i str>) -> SymbolId<Ix> {
    let next_index = <Ix as TryFrom<usize>>::try_from(syms.len())
        .expect("internal error: SymbolId range exhausted");

    // != 0 check
    debug_assert!(Ix::new(next_index).is_some());

    // SAFETY: per the original author's note,
    // `next_index` is always >0 from initialization,
    // which the debug assertion above verifies in debug builds.
    // NOTE(review): that initialization is not visible from this
    // function; confirm the symbol list can never be empty here.
    unsafe { SymbolId::from_int_unchecked(next_index) }
}
/// Clone the bytes of `value` into the arena and return them as a string
/// slice with the interner's lifetime `'i`.
#[inline]
fn copy_slice_into_arena(&self, value: &str) -> &'i str {
    // SAFETY: the bytes are cloned from `value`,
    // which is already a valid `&str`,
    // so skipping UTF-8 validation cannot yield an invalid string.
    // NOTE(review): the raw-pointer round trip detaches the result from
    // the borrow of `self` so that it may be given lifetime `'i`;
    // presumably the arena keeps the bytes alive for that long; confirm.
    unsafe {
        let bytes = self.arena.alloc_slice_clone(value.as_bytes());
        let copied = std::str::from_utf8_unchecked(bytes) as *const str;

        &*copied
    }
}
}
impl<'i, S, Ix> Interner<'i, Ix> for ArenaInterner<'i, S, Ix>
@ -247,22 +283,8 @@ where
let mut syms = self.strings.borrow_mut();
let next_index: Ix = syms
.len()
.try_into()
.expect("internal error: SymbolId range exhausted");
// This is not actually unsafe because next_index is always >0
// from initialization.
debug_assert!(Ix::new(next_index).is_some()); // != 0 check
let id = unsafe { SymbolId::from_int_unchecked(next_index) };
// Copy string slice into the arena.
let clone: &'i str = unsafe {
&*(std::str::from_utf8_unchecked(
self.arena.alloc_slice_clone(value.as_bytes()),
) as *const str)
};
let id = Self::get_next_symbol_id(&mut syms);
let clone = self.copy_slice_into_arena(value);
map.insert(clone, id);
syms.push(clone);
@ -275,6 +297,15 @@ where
self.map.borrow().get(value).map(|sym| *sym)
}
fn clone_uninterned(&self, value: &str) -> SymbolId<Ix> {
    let mut strings = self.strings.borrow_mut();

    // Reserve the identifier before copying,
    // since it is derived from the current number of stored strings.
    let sym = Self::get_next_symbol_id(&mut strings);
    let copy = self.copy_slice_into_arena(value);

    // Only the string list is updated;
    // the lookup map is left alone,
    // so the string is stored without being interned.
    strings.push(copy);

    sym
}
#[inline]
fn contains(&self, value: &str) -> bool {
self.map.borrow().contains_key(value)
@ -415,6 +446,42 @@ mod test {
assert_eq!(Some(foo), sut.intern_soft("foo"));
}
#[test]
fn uninterned_symbol_does_not_compare_equal_to_same_string() {
    let sut = Sut::new();
    let given = "foo";

    let sym_interned = sut.intern(given);
    let sym_uninterned = sut.clone_uninterned(given);

    // Distinct symbols, despite being created from the same string...
    assert_ne!(sym_uninterned, sym_interned);

    // ...yet both resolve to equal underlying strings.
    assert_eq!(
        sut.index_lookup(sym_uninterned),
        sut.index_lookup(sym_interned)
    );
}
// Unlike the previous test, this verifies that allocating an uninterned
// symbol does not itself intern the string, so that interning the same
// string afterward never hands back one of the uninterned symbols.
#[test]
fn allocating_uninterned_symbol_does_not_intern() {
    let sut = Sut::new();
    let given = "foo";

    // Allocate the uninterned symbols _before_ interning anything.
    let first_uninterned = sut.clone_uninterned(given);
    let second_uninterned = sut.clone_uninterned(given);
    let first_interned = sut.intern(given);
    let second_interned = sut.intern(given);

    assert_ne!(first_uninterned, first_interned);
    assert_ne!(second_uninterned, first_interned);
    assert_ne!(first_uninterned, second_uninterned);

    // But we shouldn't have tainted normal interner behavior.
    assert_eq!(first_interned, second_interned);
}
#[test]
fn new_with_capacity() {
let n = 512;

View File

@ -218,6 +218,25 @@
//! if you utilize interners for any other purpose,
//! it is advised that you create newtypes for their [`SymbolId`]s.
//!
//! Uninterned Symbols
//! ------------------
//! Interners are able to allocate a [`SymbolId`] without interning,
//! which will produce a symbol that cannot compare equal to any other
//! symbol and avoids the hashing cost required to perform interning.
//! This is useful for a couple of reasons:
//!
//! 1. To create a symbol that is guaranteed to be unique,
//! even if the same string value was previously interned; and
//! 2. To store a string without a hashing cost,
//! making [`SymbolId`] a suitable substitute for [`String`] when the
//! string will never need the benefits of internment.
//!
//! The second option allows all data structures to consistently carry
//! [`SymbolId`] and let the owner of those data decide whether it is
//! appropriate to incur a hashing cost;
//! using [`String`] forces that decision upon users of the data
//! structure,
//! and also makes for an awkward and confusing API.
//!
//! Related Work and Further Reading
//! ================================

View File

@ -337,7 +337,16 @@ impl<Ix: SymbolIndexSize> GlobalSymbolResolve for SymbolId<Ix> {
/// Rust is able to infer this itself and so it looks quite natural.
pub trait GlobalSymbolIntern<Ix: SymbolIndexSize> {
/// Intern a string using a global interner.
///
/// This consumes `self` and yields the [`SymbolId`] for it.
///
/// See [`crate::sym`] for more information.
fn intern(self) -> SymbolId<Ix>;
/// Copy the provided slice into the intern pool and produce a symbol
/// using a global interner,
/// but do not intern the symbol.
///
/// The resulting symbol can still be resolved back to the string it
/// was created from.
///
/// See [`crate::sym`] for more information.
fn clone_uninterned(self) -> SymbolId<Ix>;
}
/// Intern a byte slice using a global interner.
@ -363,6 +372,10 @@ impl<Ix: SymbolIndexSize> GlobalSymbolIntern<Ix> for &str {
fn intern(self) -> SymbolId<Ix> {
    // Delegate to the static interner associated with `Ix`.
    Ix::with_static_interner(|global| global.intern(self))
}
fn clone_uninterned(self) -> SymbolId<Ix> {
    // Delegate to the static interner associated with `Ix`.
    Ix::with_static_interner(|global| global.clone_uninterned(self))
}
}
impl<Ix: SymbolIndexSize> GlobalSymbolInternUnchecked<Ix> for &[u8] {
@ -428,5 +441,11 @@ mod test {
);
});
}
#[test]
fn clone_uninterned() {
    let given = "foo";

    // The symbol must resolve back to the string it was created from.
    let sym: PkgSymbolId = given.clone_uninterned();

    assert_eq!(given, sym.lookup_str());
}
}
}