TAMER: Arena-based string interner

Contrary to what I said previously, this replaces the previous
implementation with an arena-backed internment system.  The motivation for
this change was investigating how Rustc performed its string interning, and
why they chose to associate integer identifiers with symbols.

The intent was originally to use Rustc's arena allocator directly, but that
crate pulled in far too many dependencies and depended on nightly
Rust.  Bumpalo provides a very similar implementation to Rustc's
DroplessArena, so I went with that instead.

Rustc also relies on a global, singleton interner.  I do not do that
here.  Instead, the returned Symbol carries a lifetime of the underlying
arena, as well as a pointer to the interned string.

Now that this is put to rest, it's time to move on.
master
Mike Gerwitz 2019-12-23 23:26:42 -05:00
parent 176d099fb6
commit 1f4db84f24
4 changed files with 400 additions and 853 deletions

7
tamer/Cargo.lock generated
View File

@ -1,5 +1,10 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "bumpalo"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "byteorder"
version = "1.3.2"
@ -49,6 +54,7 @@ dependencies = [
name = "tamer"
version = "0.0.0"
dependencies = [
"bumpalo 2.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
"fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
"fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"petgraph 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
@ -56,6 +62,7 @@ dependencies = [
]
[metadata]
"checksum bumpalo 2.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ad807f2fc2bf185eeb98ff3a901bd46dc5ad58163d0fa4577ba0d25674d71708"
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum fixedbitset 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "86d4de0081402f5e88cdac65c8dcdcc73118c1a7a465e2a05f0da05843a8ea33"
"checksum fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"

View File

@ -23,8 +23,9 @@ lto = true
lto = true
[dependencies]
bumpalo = ">= 2.6.0"
# used by petgraph
fixedbitset = ">= 0.1"
fxhash = ">= 0.2.1"
petgraph = ">= 0.4.13"
quick-xml = ">= 0.17.0"
# used by petgraph
fixedbitset = ">= 0.1"

View File

@ -28,67 +28,13 @@ use std::rc::Rc;
use tamer::sym::*;
use test::Bencher;
/// Benchmarks comparing the `SymbolRc` wrapper against raw `Rc<str>`
/// for construction and for equality checks.
///
/// Each `SymbolRc` benchmark is paired with an `Rc<str>` benchmark that
/// performs the same sequence of operations.
mod symbol {
use super::*;
/// This is our baseline. We should never be any slower than this.
///
/// Each benchmark iteration converts the same `&str` into an `Rc<str>`
/// 1000 times, discarding every result.
#[bench]
fn new_raw_rc_1000_baseline(bench: &mut Bencher) {
let s = "foo bar baz";
bench.iter(|| {
(0..1000)
.map(|_| {
let _: Rc<str> = s.into();
})
.for_each(drop);
});
}
/// Using the `SymbolRc` wrapper should perform no differently than the
/// above test with `Rc<str>`.
///
/// Each benchmark iteration calls `SymbolRc::new` on the same `&str`
/// 1000 times, dropping every result.
#[bench]
fn new_symbol_rc_1000(bench: &mut Bencher) {
let s = "foo bar baz";
bench.iter(|| {
(0..1000).map(|_| SymbolRc::new(s)).for_each(drop);
});
}
/// Rc uses pointer comparisons when possible. We want to match that.
///
/// Builds two `Rc<str>` values with identical contents and a third that
/// differs by one trailing character, then compares equal and unequal
/// pairs. The three values are constructed inside the timed closure, so
/// their construction is part of what is measured.
#[bench]
fn eq_check_rc_baseline(bench: &mut Bencher) {
bench.iter(|| {
let a: Rc<str> = "foobarbazquux".into();
let b: Rc<str> = "foobarbazquux".into();
let c: Rc<str> = "foobarbazquuxx".into();
let _ = a == b;
let _ = a == c;
});
}
/// `SymbolRc` counterpart of `eq_check_rc_baseline`: the same three
/// strings and the same two comparisons, using `SymbolRc` values.
///
/// NOTE(review): despite the `_baseline` suffix, this benchmark exercises
/// `SymbolRc` rather than the raw `Rc<str>` baseline; confirm whether the
/// name is intentional.
#[bench]
fn eq_check_symbol_rc_baseline(bench: &mut Bencher) {
bench.iter(|| {
let a: SymbolRc = "foobarbazquux".into();
let b: SymbolRc = "foobarbazquux".into();
let c: SymbolRc = "foobarbazquuxx".into();
let _ = a == b;
let _ = a == c;
});
}
}
/// Produce `n` distinct strings for use as benchmark input.
///
/// The string at index `i` is the decimal representation of `i` followed
/// by a fixed suffix, so every element of the result is unique.
fn gen_strs(n: usize) -> Vec<String> {
    const SUFFIX: &str = "foobarbazquuxlongsymbol";

    let mut strs = Vec::with_capacity(n);

    for i in 0..n {
        let mut s = i.to_string();
        s.push_str(SUFFIX);
        strs.push(s);
    }

    strs
}
mod hash_set {
mod interner {
use super::*;
use std::collections::hash_map::RandomState;
use std::collections::HashSet;
@ -137,7 +83,7 @@ mod hash_set {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut = HashSetInterner::<SymbolRc>::new();
let sut = ArenaInterner::<RandomState>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
@ -154,7 +100,7 @@ mod hash_set {
#[bench]
fn with_one_new_1000(bench: &mut Bencher) {
bench.iter(|| {
let mut sut = HashSetInterner::<SymbolRc>::new();
let sut = ArenaInterner::<RandomState>::new();
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
@ -178,7 +124,7 @@ mod hash_set {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut = HashSetInterner::<SymbolRc, FxBuildHasher>::new();
let sut = ArenaInterner::<FxBuildHasher>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
@ -197,7 +143,7 @@ mod hash_set {
#[bench]
fn with_one_new_1000(bench: &mut Bencher) {
bench.iter(|| {
let mut sut = HashSetInterner::<SymbolRc, FxBuildHasher>::new();
let sut = ArenaInterner::<FxBuildHasher>::new();
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
@ -210,176 +156,9 @@ mod hash_set {
let strs = gen_strs(n);
bench.iter(|| {
let mut sut =
HashSetInterner::<SymbolRc, FxBuildHasher>::with_capacity(
n,
);
let sut = ArenaInterner::<FxBuildHasher>::with_capacity(n);
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
}
}
/// Benchmarks for `HashMapInterner`, each paired with a baseline that
/// interns raw `Rc<str>` keys into a plain `HashMap` (see `HashMapSut`).
mod hash_map {
use super::*;
use std::collections::hash_map::{Entry, RandomState};
use std::collections::HashMap;
use std::hash::BuildHasher;
/// Baseline system under test: a `HashMap` keyed by `Rc<str>` with an
/// arbitrary value type `M` and a configurable hasher `S`.
pub struct HashMapSut<M, S = RandomState>
where
S: BuildHasher,
{
/// Backing map from each interned string to its associated `M` value.
pub map: HashMap<Rc<str>, M, S>,
}
impl<M, S> HashMapSut<M, S>
where
M: Default,
S: BuildHasher + Default,
{
/// Create an empty map using the default-constructed hasher `S`.
#[inline]
fn new() -> Self {
Self {
map: HashMap::with_hasher(Default::default()),
}
}
/// Intern `value`, returning an `Rc<str>` that shares the allocation of
/// the key stored in the map.
///
/// If `value` is not yet present it is inserted with `M::default()` as
/// its value.
pub fn intern(&mut self, value: &str) -> Rc<str> {
// `value.into()` builds a new `Rc<str>` on every call, even when the
// string is already present; on a hit that temporary key is dropped.
match self.map.entry(value.into()) {
Entry::Vacant(v) => {
// Clone the key before `insert` consumes the vacant entry.
let intern = v.key().clone();
v.insert(Default::default());
intern
}
Entry::Occupied(o) => o.key().clone(),
}
}
}
/// This is our baseline with a raw Rc<str>.
///
/// Interns every string from `gen_strs(1000)` into a fresh `HashMapSut`
/// on each benchmark iteration.
#[bench]
fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut = HashMapSut::<(), RandomState>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
/// Interns every string from `gen_strs(1000)` into a fresh
/// `HashMapInterner` (unit metadata, default hasher) on each iteration.
#[bench]
fn with_all_new_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut = HashMapInterner::<SymbolRc, ()>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
#[bench]
/// This is our baseline with a raw Rc<str>.
///
/// Interns the same string `"first"` 1000 times into a fresh
/// `HashMapSut`, so only the first call inserts a new key.
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
bench.iter(|| {
let mut sut = HashMapSut::<(), RandomState>::new();
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
/// Interns the same string `"first"` 1000 times into a fresh
/// `HashMapInterner` (unit metadata, default hasher).
#[bench]
fn with_one_new_1000(bench: &mut Bencher) {
bench.iter(|| {
let mut sut = HashMapInterner::<SymbolRc, ()>::new();
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
/// The same set of benchmarks as the parent module, using
/// `FxBuildHasher` in place of the default `RandomState` hasher.
mod fx {
use super::*;
use fxhash::FxBuildHasher;
/// This is our baseline with a raw Rc<str>.
///
/// Interns every string from `gen_strs(1000)` into a fresh
/// `HashMapSut` using `FxBuildHasher`.
#[bench]
fn with_all_new_rc_str_1000_baseline(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut = HashMapSut::<(), FxBuildHasher>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
/// Interns every string from `gen_strs(1000)` into a fresh
/// `HashMapInterner` using `FxBuildHasher`.
#[bench]
fn with_all_new_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut =
HashMapInterner::<SymbolRc, (), FxBuildHasher>::new();
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
#[bench]
/// This is our baseline with a raw Rc<str>.
///
/// Interns the same string `"first"` 1000 times using `FxBuildHasher`.
fn with_one_new_rc_str_1000_baseline(bench: &mut Bencher) {
bench.iter(|| {
// NOTE(review): built with a struct literal rather than
// `HashMapSut::new()` as in the sibling baselines; the map is
// initialized the same way in both cases.
let mut sut: HashMapSut<(), FxBuildHasher> = HashMapSut {
map: HashMap::with_hasher(Default::default()),
};
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
/// Interns the same string `"first"` 1000 times into a fresh
/// `HashMapInterner` using `FxBuildHasher`.
#[bench]
fn with_one_new_1000(bench: &mut Bencher) {
bench.iter(|| {
let mut sut =
HashMapInterner::<SymbolRc, (), FxBuildHasher>::new();
(0..1000).map(|_| sut.intern("first")).for_each(drop);
});
}
/// Since Fx is the best-performing, let's build upon it to demonstrate
/// the benefits of with_capacity
///
/// NOTE(review): this comment originally said "FNV", but every benchmark
/// in this module uses `FxBuildHasher`; the wording was presumably left
/// over from an earlier hasher choice.
#[bench]
fn with_all_new_1000_with_capacity(bench: &mut Bencher) {
let n = 1000;
let strs = gen_strs(n);
bench.iter(|| {
let mut sut =
HashMapInterner::<SymbolRc, (), FxBuildHasher>::with_capacity(n);
strs.iter().map(|s| sut.intern(&s)).for_each(drop);
});
}
/// Calls `intern_meta(&s, 0)` for every string from `gen_strs(1000)` on
/// a fresh interner whose metadata type is `u8`.
///
/// NOTE(review): presumably `intern_meta` interns the string and sets
/// its metadata in one call; confirm against `HashMapInterner`.
#[bench]
fn with_all_new_meta_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut =
HashMapInterner::<SymbolRc, u8, FxBuildHasher>::new();
strs.iter().map(|s| sut.intern_meta(&s, 0)).for_each(drop);
});
}
/// For every string from `gen_strs(1000)`, calls `intern` first and then
/// `intern_meta` with the same string, so the second call always sees a
/// string that was just passed to `intern`.
#[bench]
fn with_all_new_then_set_meta_1000(bench: &mut Bencher) {
let strs = gen_strs(1000);
bench.iter(|| {
let mut sut =
HashMapInterner::<SymbolRc, u8, FxBuildHasher>::new();
strs.iter()
.map(|s| {
sut.intern(&s);
sut.intern_meta(&s, 0);
})
.for_each(drop);
});
}
}
}

File diff suppressed because it is too large Load Diff